Source code for pipeline.utils.loaderutils.preprocessing
"""A module that defines utilities used to handle the Pandas Dataframe laoded from
CSV-like files.
"""
import os.path as op
import typing
from tqdm.auto import tqdm
import pandas as pd
def cast_boolean_columns(particles: pd.DataFrame):
    """Cast the known boolean-like columns of ``particles`` to ``bool`` dtype.
    In-place.

    Only columns actually present in the dataframe are cast; missing
    candidates are silently skipped.

    Args:
        particles: dataframe of particles
    """
    # Candidate flag columns; a given file may contain any subset of them.
    candidates = {
        "has_velo",
        "has_ut",
        "has_scifi",
        "from_bdecay",
        "from_cdecay",
        "from_sdecay",
    }
    present = [column for column in particles.columns if column in candidates]
    particles[present] = particles[present].astype(dtype="bool")
def combine_run_event_into_event_id(dataframe: pd.DataFrame):
    """Define the ``event_id`` column as ``event + run * 10**9``. In-place.

    Both ``run`` and ``event`` are cast to ``int64`` first so the combined
    identifier does not overflow a 32-bit integer.

    Args:
        dataframe: dataframe with integer-like ``run`` and ``event`` columns
    """
    dataframe["event_id"] = dataframe["event"].astype("int64") + dataframe[
        "run"
    ].astype("int64") * (10**9)
def load_dataframes(
    indir: str,
    hits_particles_filename: str | None = None,
    particles_filename: str | None = None,
    hits_particles_columns: typing.List[str] | None = None,
    particles_columns: typing.List[str] | None = None,
    use_run_number: bool = True,
    **kwargs,
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the dataframes of hits_particles and particles that are stored in a folder.
    This function is also used in the validation step.

    Args:
        indir: directory where the dataframes are saved
        hits_particles_filename: Name of the hits-particles file name
            (without the ``.parquet.lz4`` extension). Default is ``hits_velo``.
        particles_filename: Name of the particle file name
            (without the ``.parquet.lz4`` extension). Default is ``mc_particles``.
        hits_particles_columns: columns to load for the dataframe of hits
            and the hits-particles association information
        particles_columns: columns to load for the dataframe of particles
        use_run_number: whether to define the event ID (``event_id`` column) as
            ``event + (10**9) * run`` instead of just ``event``.
        **kwargs: other keyword arguments passed to :py:func:`pandas.read_parquet`

    Returns:
        A 2-tuple containing the dataframe of hits-particles and the dataframe
        of particles.

    Notes:
        The function also defines the column ``particle_id = mcid + 1``
        in both dataframes (and drops ``mcid``), and renames ``lhcbid``
        to ``hit_id`` in the hits-particles dataframe.
    """
    if hits_particles_filename is None:
        hits_particles_filename = "hits_velo"
    if particles_filename is None:
        particles_filename = "mc_particles"
    # When a column subset is requested, always include the key columns
    # needed below (`run`, `event`, `mcid`, and `lhcbid` for hits).
    particles = pd.read_parquet(
        path=op.join(indir, particles_filename + ".parquet.lz4"),
        columns=(
            None
            if particles_columns is None
            else ["run", "event", "mcid"] + particles_columns
        ),
        **kwargs,
    )
    hits_particles = pd.read_parquet(
        path=op.join(indir, hits_particles_filename + ".parquet.lz4"),
        columns=(
            None
            if hits_particles_columns is None
            else ["run", "event", "mcid", "lhcbid"] + hits_particles_columns
        ),
        **kwargs,
    )
    cast_boolean_columns(particles)
    # Define the per-event identifier, optionally folding the run number in.
    if use_run_number:
        for dataframe in (hits_particles, particles):
            combine_run_event_into_event_id(dataframe=dataframe)
    else:
        for dataframe in (hits_particles, particles):
            dataframe["event_id"] = dataframe["event"].copy()
    # Define `particle_id = mcid + 1` directly in the original dataframes,
    # then drop the now-redundant `mcid` column.
    for dataframe in (hits_particles, particles):
        dataframe["particle_id"] = dataframe["mcid"] + 1
        dataframe.drop("mcid", axis=1, inplace=True)
    # Rename `lhcbid` to `hit_id`
    hits_particles.rename(columns={"lhcbid": "hit_id"}, inplace=True)
    return hits_particles, particles
def load_preprocessed_dataframes(
    truncated_paths: typing.List[str],
    ending: str,
    **kwargs,
) -> pd.DataFrame:
    """Load dataframes stored in parquet files, whose paths are in the form
    ``{truncated_path}{ending}.parquet`` where the truncated path ends with
    9 numbers corresponding to the event ID.

    Args:
        truncated_paths: list of truncated paths, without ``ending`` and
            the extension ``.parquet``
        ending: ending of the file, excluding the extension ``.parquet``
        **kwargs: passed to :py:func:`pandas.read_parquet`

    Returns:
        Dataframe, where the ``event_id`` was also added.
    """
    frames = []
    prefix_length = len("event")
    for truncated_path in tqdm(truncated_paths):
        # The basename looks like `event<ID>`; the digits after the
        # `event` prefix are the event identifier.
        event_id = int(op.basename(truncated_path)[prefix_length:])
        frame = pd.read_parquet(f"{truncated_path}{ending}.parquet", **kwargs)
        frame["event_id"] = event_id
        frames.append(frame)
    return pd.concat(frames)