Source code for pipeline.utils.loaderutils.preprocessing
"""A module that defines utilities used to handle the Pandas Dataframe laoded from
CSV-like files.
"""
import os.path as op
import typing
from tqdm.auto import tqdm
import pandas as pd
def cast_boolean_columns(particles: pd.DataFrame):
    """Cast the known boolean-like columns of ``particles`` to ``bool`` dtype.
    In-place.

    Only columns actually present in the dataframe are cast; missing
    candidates are silently skipped.

    Args:
        particles: dataframe of particles
    """
    # Candidate flag columns; a given file may contain any subset of them.
    candidates = {
        "has_velo",
        "has_ut",
        "has_scifi",
        "from_bdecay",
        "from_cdecay",
        "from_sdecay",
    }
    present = [column for column in particles.columns if column in candidates]
    particles[present] = particles[present].astype(dtype="bool")
def combine_run_event_into_event_id(dataframe: pd.DataFrame):
    """Define the ``event_id`` column as ``event + run * 10**9``. In-place.

    Both ``run`` and ``event`` are cast to ``int64`` first so the combined
    identifier does not overflow a 32-bit integer.

    Args:
        dataframe: dataframe with integer-like ``run`` and ``event`` columns
    """
    dataframe["event_id"] = dataframe["event"].astype("int64") + dataframe[
        "run"
    ].astype("int64") * (10**9)
def load_dataframes(
    indir: str,
    hits_particles_filename: str | None = None,
    particles_filename: str | None = None,
    hits_particles_columns: typing.List[str] | None = None,
    particles_columns: typing.List[str] | None = None,
    use_run_number: bool = True,
    **kwargs,
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the dataframes of hits_particles and particles that are stored in a folder.
    This function is also used in the validation step.

    Args:
        indir: directory where the dataframes are saved
        hits_particles_filename: Name of the hits-particles file name
            (without the ``.parquet.lz4`` extension). Default is ``hits_velo``.
        particles_filename: Name of the particle file name
            (without the ``.parquet.lz4`` extension). Default is ``mc_particles``.
        hits_particles_columns: columns to load for the dataframe of hits
            and the hits-particles association information
        particles_columns: columns to load for the dataframe of particles
        use_run_number: whether to define the event ID (``event_id`` column) as
            ``event + (10**9) * run`` instead of just ``event``.
        **kwargs: other keyword arguments passed to :py:func:`pandas.read_parquet`

    Returns:
        A 2-tuple containing the dataframe of hits-particles and the dataframe
        of particles.

    Notes:
        The function also defines the column ``particle_id = mcid + 1``
        in both dataframes (and drops ``mcid``), and renames ``lhcbid``
        to ``hit_id`` in the hits-particles dataframe.
    """
    if hits_particles_filename is None:
        hits_particles_filename = "hits_velo"
    if particles_filename is None:
        particles_filename = "mc_particles"
    # When a column subset is requested, always include the key columns
    # needed below (`run`, `event`, `mcid`, and `lhcbid` for hits).
    particles = pd.read_parquet(
        path=op.join(indir, particles_filename + ".parquet.lz4"),
        columns=(
            None
            if particles_columns is None
            else ["run", "event", "mcid"] + particles_columns
        ),
        **kwargs,
    )
    hits_particles = pd.read_parquet(
        path=op.join(indir, hits_particles_filename + ".parquet.lz4"),
        columns=(
            None
            if hits_particles_columns is None
            else ["run", "event", "mcid", "lhcbid"] + hits_particles_columns
        ),
        **kwargs,
    )
    cast_boolean_columns(particles)
    # Define the per-event identifier, optionally folding the run number in.
    if use_run_number:
        for dataframe in (hits_particles, particles):
            combine_run_event_into_event_id(dataframe=dataframe)
    else:
        for dataframe in (hits_particles, particles):
            dataframe["event_id"] = dataframe["event"].copy()
    # Define `particle_id = mcid + 1` directly in the original dataframes,
    # then drop the now-redundant `mcid` column.
    for dataframe in (hits_particles, particles):
        dataframe["particle_id"] = dataframe["mcid"] + 1
        dataframe.drop("mcid", axis=1, inplace=True)
    # Rename `lhcbid` to `hit_id`
    hits_particles.rename(columns={"lhcbid": "hit_id"}, inplace=True)
    return hits_particles, particles
def load_preprocessed_dataframes(
    truncated_paths: typing.List[str],
    ending: str,
    **kwargs,
) -> pd.DataFrame:
    """Load dataframes stored in parquet files, whose paths are in the form
    ``{truncated_path}{ending}.parquet`` where the truncated path ends with
    9 numbers corresponding to the event ID.

    Args:
        truncated_paths: list of truncated paths, without ``ending`` and
            the extension ``.parquet``
        ending: ending of the file, excluding the extension ``.parquet``
        **kwargs: passed to :py:func:`pandas.read_parquet`

    Returns:
        Dataframe, where the ``event_id`` was also added.
    """
    frames = []
    prefix_length = len("event")
    for truncated_path in tqdm(truncated_paths):
        # The basename looks like `event<ID>`; the digits after the
        # `event` prefix are the event identifier.
        event_id = int(op.basename(truncated_path)[prefix_length:])
        frame = pd.read_parquet(f"{truncated_path}{ending}.parquet", **kwargs)
        frame["event_id"] = event_id
        frames.append(frame)
    return pd.concat(frames)