Source code for pipeline.utils.loaderutils.preprocessing

"""A module that defines utilities used to handle the Pandas Dataframe laoded from
CSV-like files.
"""
import os.path as op
import typing

from tqdm.auto import tqdm
import pandas as pd


def cast_boolean_columns(particles: pd.DataFrame):
    """Cast the columns of the ``particles`` dataframe as boolean columns. In-place.

    Args:
        particles: dataframe of particles
    """
    boolean_columns = list(
        set(particles.columns).intersection(
            {
                "has_velo",
                "has_ut",
                "has_scifi",
                "from_bdecay",
                "from_cdecay",
                "from_sdecay",
            }
        )
    )
    particles[boolean_columns] = particles[boolean_columns].astype(dtype="bool")
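

# A minimal usage sketch (the data below is hypothetical, not from the real
# files): integer-encoded flag columns are cast to ``bool`` in place, and flag
# columns absent from the dataframe are simply ignored.
def _example_cast_boolean_columns():
    particles = pd.DataFrame({"has_velo": [0, 1], "from_bdecay": [1, 0]})
    cast_boolean_columns(particles)
    assert particles["has_velo"].dtype == bool
    assert particles["from_bdecay"].tolist() == [True, False]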


def combine_run_event_into_event_id(dataframe: pd.DataFrame):
    """Define the ``event_id`` column as ``event + (10**9) * run``. In-place.

    Args:
        dataframe: dataframe with ``run`` and ``event`` columns
    """
    dataframe["event_id"] = dataframe["event"].astype("int64") + dataframe[
        "run"
    ].astype("int64") * (10**9)
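

# A minimal sketch of the event-ID encoding (hypothetical run/event values):
# the run number occupies the digits above 10**9 and the event number the
# digits below, so both remain recoverable from ``event_id``.
def _example_combine_run_event_into_event_id():
    df = pd.DataFrame({"run": [3], "event": [42]})
    combine_run_event_into_event_id(df)
    assert df["event_id"].iloc[0] == 3_000_000_042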


def load_dataframes(
    indir: str,
    hits_particles_filename: str | None = None,
    particles_filename: str | None = None,
    hits_particles_columns: typing.List[str] | None = None,
    particles_columns: typing.List[str] | None = None,
    use_run_number: bool = True,
    **kwargs,
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the dataframes of hits_particles and particles that are stored in
    a folder. This function is also used in the validation step.

    Args:
        indir: directory where the dataframes are saved
        hits_particles_filename: name of the hits-particles file (without the
            ``.parquet.lz4`` extension). Default is ``hits_velo``.
        particles_filename: name of the particles file (without the
            ``.parquet.lz4`` extension). Default is ``mc_particles``.
        hits_particles_columns: columns to load for the dataframe of hits
            and the hits-particles association information
        particles_columns: columns to load for the dataframe of particles
        use_run_number: whether to define the event ID (``event_id`` column)
            as ``event + (10**9) * run`` instead of just ``event``.
        **kwargs: other keyword arguments passed to the function that loads
            the files

    Returns:
        A 2-tuple containing the dataframe of hits-particles and the dataframe
        of particles.

    Notes:
        The function also defines the column ``particle_id = mcid + 1`` in
        both dataframes.
    """
    if hits_particles_filename is None:
        hits_particles_filename = "hits_velo"
    if particles_filename is None:
        particles_filename = "mc_particles"

    particles = pd.read_parquet(
        path=op.join(indir, particles_filename + ".parquet.lz4"),
        columns=(
            None
            if particles_columns is None
            else ["run", "event", "mcid"] + particles_columns
        ),
        **kwargs,
    )
    hits_particles = pd.read_parquet(
        path=op.join(indir, hits_particles_filename + ".parquet.lz4"),
        columns=(
            None
            if hits_particles_columns is None
            else ["run", "event", "mcid", "lhcbid"] + hits_particles_columns
        ),
        **kwargs,
    )
    cast_boolean_columns(particles)

    if use_run_number:
        for df in (hits_particles, particles):
            combine_run_event_into_event_id(dataframe=df)
    else:
        for df in (hits_particles, particles):
            df["event_id"] = df["event"].copy()

    # Define `particle_id = mcid + 1` directly in the original dataframes
    particles["particle_id"] = particles["mcid"] + 1
    hits_particles["particle_id"] = hits_particles["mcid"] + 1
    particles.drop("mcid", axis=1, inplace=True)
    hits_particles.drop("mcid", axis=1, inplace=True)

    # Rename `lhcbid` to `hit_id`
    hits_particles.rename(columns={"lhcbid": "hit_id"}, inplace=True)

    return hits_particles, particles
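

# A minimal usage sketch (the directory and column names below are
# hypothetical): the extra columns are loaded on top of the mandatory
# ``run``/``event``/``mcid``/``lhcbid`` ones, and the returned dataframes
# expose ``particle_id`` and ``hit_id`` instead of the raw ``mcid`` and
# ``lhcbid``, plus a combined ``event_id`` column.
def _example_load_dataframes():
    hits_particles, particles = load_dataframes(
        indir="/data/events",  # hypothetical directory
        hits_particles_columns=["x", "y", "z"],  # hypothetical hit coordinates
        particles_columns=["p", "pt"],  # hypothetical particle kinematics
    )
    print(hits_particles[["event_id", "hit_id", "particle_id"]].head())
    print(particles[["event_id", "particle_id"]].head())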


def load_preprocessed_dataframes(
    truncated_paths: typing.List[str],
    ending: str,
    **kwargs,
) -> pd.DataFrame:
    """Load dataframes stored in parquet files, whose paths are in the form
    ``{truncated_path}{ending}.parquet``, where the truncated path ends with
    9 digits corresponding to the event ID.

    Args:
        truncated_paths: list of truncated paths, without ``ending`` and the
            extension ``.parquet``
        ending: ending of the file, excluding the extension ``.parquet``
        **kwargs: passed to :py:func:`pandas.read_parquet`

    Returns:
        The concatenated dataframe, with an ``event_id`` column added.
    """
    list_dataframes = []
    for truncated_path in tqdm(truncated_paths):
        # The event ID is encoded in the digits after the `event` prefix
        event_id = int(op.basename(truncated_path)[len("event") :])
        dataframe = pd.read_parquet(f"{truncated_path}{ending}.parquet", **kwargs)
        dataframe["event_id"] = event_id
        list_dataframes.append(dataframe)
    return pd.concat(list_dataframes)
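

# A minimal usage sketch (hypothetical file layout): with a file such as
# ``/data/preprocessed/event000000042_hits.parquet``, the truncated path is
# ``/data/preprocessed/event000000042`` and the ending is ``_hits``; the
# event ID 42 is parsed from the 9 digits that follow the ``event`` prefix.
def _example_load_preprocessed_dataframes():
    dataframe = load_preprocessed_dataframes(
        truncated_paths=["/data/preprocessed/event000000042"],  # hypothetical
        ending="_hits",  # hypothetical ending
    )
    assert (dataframe["event_id"] == 42).all()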