Source code for pipeline.Preprocessing.process_custom

import typing
import logging
import numpy as np
import pandas as pd
from .particle_fitting_metrics import compute_particle_line_metrics_dataframe
from .hit_filtering import mask_long_into_small_tracks
from utils.commonutils.cdetector import get_coordinate_names


class SelectionFunction(typing.Protocol):
    """Signature shared by the selection functions of this module: take the
    hits-particles and particles dataframes and return both, filtered."""

    def __call__(
        self, hits_particles: pd.DataFrame, particles: pd.DataFrame
    ) -> typing.Tuple[pd.DataFrame, pd.DataFrame]: ...
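
# For illustration, a minimal sketch of a function conforming to the
# ``SelectionFunction`` protocol (``keep_even_particle_ids`` and its cut are
# hypothetical, not part of this pipeline):
#
#   def keep_even_particle_ids(
#       hits_particles: pd.DataFrame, particles: pd.DataFrame
#   ) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
#       mask = particles["particle_id"] % 2 == 0
#       particles, hits_particles = apply_mask(mask, particles, hits_particles)
#       return hits_particles, particles
#
#   selection: SelectionFunction = keep_even_particle_ids  # type-checks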


def apply_mask(
    particles_mask: pd.Series,
    particles: pd.DataFrame,
    hits_particles: pd.DataFrame,
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Apply a mask of particles to keep to both the dataframe of particles
    and the dataframe of hits-particles.

    Args:
        particles_mask: The mask to apply, that corresponds to the particles
            to keep in the dataframe ``particles``
        particles: Dataframe of particles
        hits_particles: Dataframe of hits-particles association

    Returns:
        Dataframes ``particles`` and ``hits_particles`` that only contain
        the particles and hits to keep.
    """
    # About 3 seconds on a dataframe with 5000 events
    hits_particles_mask = (
        hits_particles[["event_id", "particle_id"]]
        .reset_index()
        .merge(
            right=particles[["event_id", "particle_id"]].assign(mask_=particles_mask),
            on=["event_id", "particle_id"],
            how="left",
            sort=False,
        )
        .set_index("index")["mask_"]
        .fillna(True)  # fillna to keep fake hits
    )
    return particles[particles_mask], hits_particles[hits_particles_mask]
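
# Usage sketch for ``apply_mask`` on toy dataframes (the values below are
# hypothetical). Note that the fake hit with ``particle_id == 0`` survives:
# it matches no row of ``particles``, so ``fillna(True)`` keeps it.
#
#   particles = pd.DataFrame(
#       {"event_id": [0, 0], "particle_id": [1, 2], "pid": [11, 211]}
#   )
#   hits_particles = pd.DataFrame(
#       {"event_id": [0, 0, 0], "particle_id": [1, 2, 0], "hit_id": [10, 11, 12]}
#   )
#   particles_kept, hits_kept = apply_mask(
#       particles_mask=particles["pid"].abs() == 11,
#       particles=particles,
#       hits_particles=hits_particles,
#   )
#   # particles_kept contains particle 1 only;
#   # hits_kept contains hit 10 (particle 1) and hit 12 (fake hit).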


def only_keep_hits_on_particles(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> pd.DataFrame:
    """Only keep the hits associated with a particle present in ``particles``."""
    return hits_particles.merge(
        particles[["event_id", "particle_id"]],
        how="inner",
        on=["event_id", "particle_id"],
    )


def only_long_electrons(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Only keep long electrons.

    Args:
        hits_particles: Dataframe of hits-particles association
        particles: Dataframe of particles

    Returns:
        Dataframe of hits-particles association and particles, filtered so
        that only long electrons are left.
    """
    # 1. Create a mask of the particles to keep
    mask_particles_to_keep = (
        particles["has_velo"]
        & particles["has_scifi"]
        & (particles["pid"].abs() == 11)
    )
    # 2. Propagate the mask to the dataframes of particles and hits-particles
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )
    return hits_particles, particles


def everything_but_long_electrons(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Keep everything except long electrons.

    Args:
        hits_particles: Dataframe of hits-particles association
        particles: Dataframe of particles

    Returns:
        Dataframe of hits-particles association and particles, filtered so
        that long electrons are removed.
    """
    # 1. Create a mask of the particles to keep
    mask_particles_to_keep = particles["has_velo"] & ~(
        particles["has_scifi"] & (particles["pid"].abs() == 11)
    )
    # 2. Propagate the mask to the dataframes of particles and hits-particles
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )
    return hits_particles, particles


def default_old_training_for_rta_presentation(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Selection that was used in the training presented in the RTA meeting."""
    # Drop duplicated hits
    hits_particles = hits_particles.drop_duplicates(
        subset=["event_id", "particle_id", "plane"], keep="first"
    )
    # Remove fake hits (there shouldn't be any at this point)
    hits_particles = hits_particles[hits_particles["particle_id"] != 0]

    # Compute the distance to line and add it to the dataframe of particles
    logging.info("Compute distance to line (this might take some time)")
    new_distances = compute_particle_line_metrics_dataframe(
        hits=hits_particles,
        metric_names=["distance_to_line"],
    )
    particles = particles.merge(
        new_distances, how="left", on=["event_id", "particle_id"]
    )

    # Only keep reconstructible particles that are straight enough
    logging.info("Apply particle selection mask")
    mask_particles_to_keep = (
        (particles["has_velo"] == 1)
        & (particles["nhits_velo"] >= 3)
        & (particles["distance_to_line"] < np.sqrt(0.6))
    )
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )

    # Assert that there are no NaN values at this point
    assert not particles.isna().any().any()
    assert not hits_particles.isna().any().any()
    return hits_particles, particles


def everything_but_electrons(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Keep every VELO-reconstructible particle that is not an electron.

    Args:
        hits_particles: Dataframe of hits-particles association
        particles: Dataframe of particles

    Returns:
        Dataframe of hits-particles association and particles, filtered so
        that electrons are removed.
    """
    # 1. Create a mask of the particles to keep
    mask_particles_to_keep = particles["has_velo"] & (particles["pid"].abs() != 11)
    # 2. Propagate the mask to the dataframes of particles and hits-particles
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )
    return hits_particles, particles


def track_weighting_selection(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """The selection performed in the ``track-weighting`` experiment."""
    # Only keep reconstructible particles that are straight enough.
    # Also remove their hits to avoid split tracks
    # (we counterbalance by requiring enough clusters per event).
    logging.info("Compute distance to line (this might take some time)")
    new_distances = compute_particle_line_metrics_dataframe(
        hits=hits_particles, metric_names=["distance_to_line"]
    )
    particles = particles.merge(
        new_distances, how="left", on=["event_id", "particle_id"]
    )
    mask_particles_to_keep = particles["distance_to_line"] < 0.8
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )

    # Assert that there are no NaN values at this point
    assert not particles.isna().any().any()
    assert not hits_particles.isna().any().any()
    return hits_particles, particles


def triplets_first_selection(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """The selection performed in the ``triplets-edge`` experiment."""
    # Only keep one hit-particle association per hit (drop duplicates)
    hits_particles = hits_particles.drop_duplicates(
        subset=["event_id", "hit_id"], keep="first"
    )
    # Only keep reconstructible particles that are straight enough.
    # Also remove their hits to avoid split tracks
    # (we counterbalance by requiring enough clusters per event).
    logging.info("Compute distance to line (this might take some time)")
    new_distances = compute_particle_line_metrics_dataframe(
        hits=hits_particles, metric_names=["distance_to_line"]
    )
    particles = particles.merge(
        new_distances, how="left", on=["event_id", "particle_id"]
    )
    mask_particles_to_keep = particles["distance_to_line"] < 0.8
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )

    # Assert that there are no NaN values at this point
    assert not particles.isna().any().any()
    assert not hits_particles.isna().any().any()
    return hits_particles, particles


def remove_curved_particles(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Remove curved particles."""
    # Only keep reconstructible particles that are straight enough.
    # Also remove their hits to avoid split tracks
    # (we counterbalance by requiring enough clusters per event).
    logging.info("Compute distance to line (this might take some time)")
    new_distances = compute_particle_line_metrics_dataframe(
        hits=hits_particles, metric_names=["distance_to_line"]
    )
    particles = particles.merge(
        new_distances, how="left", on=["event_id", "particle_id"]
    )
    mask_particles_to_keep = particles["distance_to_line"] < 0.8
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )

    # Assert that there are no NaN values at this point
    assert not particles.isna().any().any()
    assert not hits_particles.isna().any().any()
    return hits_particles, particles


def compute_n_unique_planes(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Compute the number of unique planes for each particle."""
    # We train with all the hits and cut before the GNN
    n_unique_planes = (
        hits_particles.groupby(["event_id", "particle_id"])["plane"]
        .nunique()
        .rename("n_unique_planes")
    )
    particles = particles.merge(
        n_unique_planes, how="left", on=["event_id", "particle_id"]
    ).fillna(0)
    return hits_particles, particles
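
# Worked sketch of the plane counting above (toy values, illustrative only):
#
#   hits = pd.DataFrame(
#       {
#           "event_id": [0, 0, 0, 0],
#           "particle_id": [1, 1, 1, 1],
#           "plane": [0, 0, 1, 3],
#       }
#   )
#   hits.groupby(["event_id", "particle_id"])["plane"].nunique()
#   # -> 3: distinct planes are counted, not hits. Particles absent from
#   # ``hits_particles`` end up with n_unique_planes == 0 via ``fillna(0)``.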


def compute_n_particles_per_hit(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Compute the number of particles associated with each hit."""
    n_particles_per_hit = (
        hits_particles.groupby(["event_id", "hit_id"])["particle_id"]
        .count()
        .rename("n_particles_per_hit")
    )
    hits_particles = hits_particles.merge(
        n_particles_per_hit,
        on=["event_id", "hit_id"],
        how="left",
    )
    return hits_particles, particles


def cut_long_tracks(
    hits_particles: pd.DataFrame,
    particles: pd.DataFrame,
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Cut long tracks (at least 7 VELO hits) into shorter ones of 3 to 6
    hits, leaving a small fraction untouched."""
    # Get tracks with at least 7 hits
    if "nhits_velo" not in hits_particles:
        hits_particles = hits_particles.merge(
            particles[["event_id", "particle_id", "nhits_velo"]],
            how="left",
            on=["event_id", "particle_id"],
        )
    mask_long = hits_particles["nhits_velo"] >= 7
    hits_particles_long = hits_particles[mask_long]
    hits_particles_short = hits_particles[~mask_long]

    # Cut long tracks to 3, 4, 5 or 6 hits, with many more 3- and 4-hit tracks
    mask_long_into_small = mask_long_into_small_tracks(
        hits_particles=hits_particles_long,
        track_size_proportions={
            3: 0.50,
            4: 0.30,
            5: 0.10,
            6: 0.02,
            -1: 0.08,  # 8 % left untouched
        },
    )
    hits_particles = pd.concat(
        (hits_particles_short, hits_particles_long[mask_long_into_small])
    )
    return hits_particles, particles
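
# For intuition, a minimal sketch of what a mask like
# ``mask_long_into_small_tracks`` could compute (a hypothetical, simplified
# implementation, NOT the one in ``hit_filtering``): draw a target size per
# track from ``track_size_proportions``, then keep only the first ``size``
# hits of the track; -1 means "leave the track untouched".
#
#   rng = np.random.default_rng(0)
#
#   def sketch_mask(hits_particles, track_size_proportions):
#       sizes = list(track_size_proportions)
#       probs = list(track_size_proportions.values())
#
#       def per_track(group):
#           size = rng.choice(sizes, p=probs)
#           if size == -1:  # leave the track untouched
#               return pd.Series(True, index=group.index)
#           keep = pd.Series(False, index=group.index)
#           keep.iloc[:size] = True
#           return keep
#
#       return hits_particles.groupby(
#           ["event_id", "particle_id"], group_keys=False
#       ).apply(per_track)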


def reconstructible_scifi(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Only keep particles that are reconstructible in the SciFi."""
    mask_particles_to_keep = particles["has_scifi"]
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )
    return hits_particles, particles


def at_least_7_planes(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Only keep particles whose plane span (max plane - min plane) is at
    least 7."""
    plane_groups = hits_particles.groupby(["event_id", "particle_id"])["plane"]
    diff_planes = plane_groups.max() - plane_groups.min()
    particles = particles.merge(
        diff_planes.rename("plane_diff").reset_index(),
        how="left",
        on=["event_id", "particle_id"],
    )
    mask_particles_to_keep = particles["plane_diff"] >= 7
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )
    return hits_particles, particles
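
# Toy check of the plane-span cut above (illustrative values):
#
#   planes = pd.Series([2, 5, 9])  # planes hit by one particle
#   planes.max() - planes.min()    # -> 7, so the particle is kept
#
# A particle with hits on planes [2, 5, 8] has a span of 6 and is removed.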


def at_least_1_hit_on_scifi(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Only keep particles with at least one SciFi hit.

    Note that only ``particles`` is filtered; ``hits_particles`` is left
    untouched.
    """
    particles = particles[particles["nhits_scifi"] >= 1]
    return hits_particles, particles


def less_than_3_hits_on_same_plane(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Only keep particles that leave at most 2 hits on any single plane."""
    # Count the hits each particle leaves on each plane, then take the
    # maximum over planes for every particle
    particles = particles.merge(
        hits_particles.groupby(["event_id", "particle_id", "plane"])["hit_id"]
        .count()
        .groupby(["event_id", "particle_id"])
        .max()
        .rename("max_n_hits_per_plane")
        .reset_index(),
        on=["event_id", "particle_id"],
        how="left",
    )
    mask_particles_to_keep = particles["max_n_hits_per_plane"] <= 2
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )
    return hits_particles, particles
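
# Step-by-step sketch of the chained groupby above (toy values, illustrative
# only):
#
#   hits = pd.DataFrame(
#       {
#           "event_id": [0, 0, 0],
#           "particle_id": [1, 1, 1],
#           "plane": [0, 0, 1],
#           "hit_id": [10, 11, 12],
#       }
#   )
#   per_plane = hits.groupby(["event_id", "particle_id", "plane"])["hit_id"].count()
#   # plane 0 -> 2 hits, plane 1 -> 1 hit
#   per_plane.groupby(["event_id", "particle_id"]).max()
#   # -> 2, so max_n_hits_per_plane == 2 and the particle passes the <= 2 cut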


def remove_particles_too_scattered_on_plane(
    hits_particles: pd.DataFrame, particles: pd.DataFrame, max_xdiffs: float = 2.5
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Remove particles whose hits on a plane are spread over more than
    ``max_xdiffs`` in ``xatyeq0``.

    Note that, as written, a particle is kept as soon as at least one of its
    planes passes the cut.
    """
    particles_planes_groups = hits_particles.groupby(
        ["event_id", "particle_id", "plane"]
    )
    xmins = particles_planes_groups["xatyeq0"].min().rename("xmin")
    xmaxs = particles_planes_groups["xatyeq0"].max().rename("xmax")
    xdiffs = xmaxs - xmins
    particles = particles.merge(
        right=xdiffs[xdiffs < max_xdiffs]
        .reset_index()[["event_id", "particle_id"]]
        .drop_duplicates(),
        how="inner",
        on=["event_id", "particle_id"],
    )
    hits_particles = only_keep_hits_on_particles(
        hits_particles=hits_particles, particles=particles
    )
    return hits_particles, particles


def remove_particle_not_poly_enough(
    hits_particles: pd.DataFrame, particles: pd.DataFrame, max_distance: float = 70.0
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Remove particles whose SciFi hits deviate too much from a quadratic
    polynomial fitted in the x-z plane."""
    poly_metrics = compute_particle_line_metrics_dataframe(
        hits_particles[hits_particles["dxdy"] == 0.0],  # hits with no stereo slope
        metric_names=["distance_to_poly", "quadratic_coeff"],
        coord_names=get_coordinate_names("scifi_xz"),
        line_type="quadpoly_2d",
    )
    particles = particles.merge(
        poly_metrics.reset_index(),
        how="left",
        on=["event_id", "particle_id"],
    )
    particles["distance_to_poly"] = particles["distance_to_poly"].fillna(0.0)
    particles["quadratic_coeff"] = particles["quadratic_coeff"].fillna(0.0)
    particles, hits_particles = apply_mask(
        particles_mask=particles["distance_to_poly"] < max_distance,
        particles=particles,
        hits_particles=hits_particles,
    )
    return hits_particles, particles