import typing
import logging
import numpy as np
import pandas as pd
from .particle_fitting_metrics import compute_particle_line_metrics_dataframe
from .hit_filtering import mask_long_into_small_tracks
from utils.commonutils.cdetector import get_coordinate_names


class SelectionFunction(typing.Protocol):
def __call__(
self, hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]: ...
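

# Every selection below implements ``SelectionFunction`` and can therefore be
# chained; a minimal sketch (hypothetical choice of selections):
#
#     for selection in (reconstructible_scifi, at_least_7_planes):
#         hits_particles, particles = selection(hits_particles, particles)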


def apply_mask(
particles_mask: pd.Series,
particles: pd.DataFrame,
hits_particles: pd.DataFrame,
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
"""Apply a mask of particles to keep to both the dataframe of particles
and the dataframe of particles-hits.
Args:
particles_mask: The mask to apply, that corresponds to the particles to keep in
the dataframe ``particles``
particles: Dataframe of particles
hits_particles: Dataframe of hits-particles association
Returns:
Dataframe of ``particles`` and ``hits_particles`` that only contain
the particles and hits to keep.
"""
    # Takes about 3 seconds on a dataframe with 5000 events
hits_particles_mask = (
hits_particles[["event_id", "particle_id"]]
.reset_index()
.merge(
right=particles[["event_id", "particle_id"]].assign(mask_=particles_mask),
on=["event_id", "particle_id"],
how="left",
sort=False,
)
.set_index("index")["mask_"]
.fillna(True)
) # fillna to keep fake hits
return particles[particles_mask], hits_particles[hits_particles_mask]
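

# Example (a minimal sketch on hypothetical dataframes): particle 2 is dropped
# together with its hit, while the fake hit (``particle_id == 0``, absent from
# ``particles``) is kept thanks to the ``fillna(True)`` above.
#
#     particles = pd.DataFrame({"event_id": [0, 0], "particle_id": [1, 2]})
#     hits_particles = pd.DataFrame(
#         {"event_id": [0, 0, 0], "particle_id": [1, 2, 0]}
#     )
#     kept_particles, kept_hits = apply_mask(
#         particles_mask=particles["particle_id"] == 1,
#         particles=particles,
#         hits_particles=hits_particles,
#     )
#     # kept_hits contains the hit of particle 1 and the fake hit only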


def only_keep_hits_on_particles(hits_particles: pd.DataFrame, particles: pd.DataFrame):
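    """Only keep hits that are associated with a particle present in ``particles``."""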
return hits_particles.merge(
particles[["event_id", "particle_id"]],
how="inner",
on=["event_id", "particle_id"],
)


def only_long_electrons(
hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
"""Only keep long electrons.
Args:
hits_particles: Dataframe of hits-particles association
particles: Dataframe of particles
Returns:
Dataframe of hits-particles association and particles, filtered so that only
long electrons are left.
"""
# 1. Create a mask of the particles to keep:
mask_particles_to_keep = (
particles["has_velo"] & particles["has_scifi"] & (particles["pid"].abs() == 11)
)
# 2. Propagate the mask to the dataframe of `particles` and `hits_particles`
particles, hits_particles = apply_mask(
mask_particles_to_keep, particles, hits_particles
)
return hits_particles, particles


def everything_but_long_electrons(
hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
"""Only keep long electrons.
Args:
hits_particles: Dataframe of hits-particles association
particles: Dataframe of particles
Returns:
Dataframe of hits-particles association and particles, filtered so that only
long electrons are left.
"""
# 1. Create a mask of the particles to keep:
mask_particles_to_keep = particles["has_velo"] & ~(
particles["has_scifi"] & (particles["pid"].abs() == 11)
)
# 2. Propagate the mask to the dataframe of `particles` and `hits_particles`
particles, hits_particles = apply_mask(
mask_particles_to_keep, particles, hits_particles
)
return hits_particles, particles


def default_old_training_for_rta_presentation(
hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
"""Selection that was used in the training presented in the RTA meeting."""
    # Drop duplicate hits
hits_particles = hits_particles.drop_duplicates(
subset=["event_id", "particle_id", "plane"], keep="first"
)
    # Remove fake hits (there should not be any at this point anyway)
hits_particles = hits_particles[hits_particles["particle_id"] != 0]
# Compute distance to line and add it to the dataframe of particles
logging.info("Compute distance to line (that might take some time)")
new_distances = compute_particle_line_metrics_dataframe(
hits=hits_particles,
metric_names=["distance_to_line"],
)
particles = particles.merge(
new_distances, how="left", on=["event_id", "particle_id"]
)
# Only keep reconstructible particles that are straight enough
logging.info("Apply particle selection mask")
mask_particles_to_keep = (
(particles["has_velo"] == 1)
& (particles["nhits_velo"] >= 3)
& (particles["distance_to_line"] < np.sqrt(0.6))
)
particles, hits_particles = apply_mask(
mask_particles_to_keep, particles, hits_particles
)
    # Assert that there are no NaN values at this point
assert not particles.isna().any().any()
assert not hits_particles.isna().any().any()
return hits_particles, particles


def everything_but_electrons(
hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
"""Only keep long electrons.
Args:
hits_particles: Dataframe of hits-particles association
particles: Dataframe of particles
Returns:
Dataframe of hits-particles association and particles, filtered so that only
long electrons are left.
"""
# 1. Create a mask of the particles to keep:
mask_particles_to_keep = particles["has_velo"] & (particles["pid"].abs() != 11)
# 2. Propagate the mask to the dataframe of `particles` and `hits_particles`
particles, hits_particles = apply_mask(
mask_particles_to_keep, particles, hits_particles
)
return hits_particles, particles


def track_weighting_selection(
hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
"""The selection performed in the ``track-weighting`` experiment."""
# Only keep reconstructible particles that are straight enough
    # Also remove the hits to avoid split tracks
    # (we will counterbalance by requiring enough clusters per event)
    logging.info("Compute distance to line (this may take some time)")
new_distances = compute_particle_line_metrics_dataframe(
hits=hits_particles, metric_names=["distance_to_line"]
)
particles = particles.merge(
new_distances, how="left", on=["event_id", "particle_id"]
)
mask_particles_to_keep = particles["distance_to_line"] < 0.8
particles, hits_particles = apply_mask(
mask_particles_to_keep, particles, hits_particles
)
    # Assert that there are no NaN values at this point
assert not particles.isna().any().any()
assert not hits_particles.isna().any().any()
return hits_particles, particles


def triplets_first_selection(
hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
"""The selection performed in the ``triplets-edge`` experiment."""
    # Only keep one particle-hit association per hit (drop duplicates)
hits_particles = hits_particles.drop_duplicates(
subset=["event_id", "hit_id"], keep="first"
)
# Only keep reconstructible particles that are straight enough
    # Also remove the hits to avoid split tracks
    # (we will counterbalance by requiring enough clusters per event)
    logging.info("Compute distance to line (this may take some time)")
new_distances = compute_particle_line_metrics_dataframe(
hits=hits_particles, metric_names=["distance_to_line"]
)
particles = particles.merge(
new_distances, how="left", on=["event_id", "particle_id"]
)
mask_particles_to_keep = particles["distance_to_line"] < 0.8
particles, hits_particles = apply_mask(
mask_particles_to_keep, particles, hits_particles
)
    # Assert that there are no NaN values at this point
    assert not particles.isna().any().any()
    assert not hits_particles.isna().any().any()
return hits_particles, particles


def remove_curved_particles(
hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
"""Remove curved particles"""
# Only keep reconstructible particles that are straight enough
    # Also remove the hits to avoid split tracks
    # (we will counterbalance by requiring enough clusters per event)
    logging.info("Compute distance to line (this may take some time)")
new_distances = compute_particle_line_metrics_dataframe(
hits=hits_particles, metric_names=["distance_to_line"]
)
particles = particles.merge(
new_distances, how="left", on=["event_id", "particle_id"]
)
mask_particles_to_keep = particles["distance_to_line"] < 0.8
particles, hits_particles = apply_mask(
mask_particles_to_keep, particles, hits_particles
)
    # Assert that there are no NaN values at this point
assert not particles.isna().any().any()
assert not hits_particles.isna().any().any()
return hits_particles, particles


def compute_n_unique_planes(
hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
"""Compute number of unique planes for each particle."""
    # We'll train with all the hits, and only cut before the GNN
n_unique_planes = (
hits_particles.groupby(["event_id", "particle_id"])["plane"]
.nunique()
.rename("n_unique_planes")
)
particles = particles.merge(
n_unique_planes, how="left", on=["event_id", "particle_id"]
).fillna(0)
return hits_particles, particles
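

# Example (hypothetical data): a particle with hits on planes [0, 0, 1, 2] gets
# ``n_unique_planes == 3``; particles without any hit fall back to 0 via the
# ``fillna(0)`` above.
#
#     hits = pd.DataFrame(
#         {"event_id": [0] * 4, "particle_id": [1] * 4, "plane": [0, 0, 1, 2]}
#     )
#     hits.groupby(["event_id", "particle_id"])["plane"].nunique()  # -> 3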


def compute_n_particles_per_hit(
hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
"""Compute number of unique planes for each particle."""
n_particles_per_hit = (
hits_particles.groupby(["event_id", "hit_id"])["particle_id"]
.count()
.rename("n_particles_per_hit")
)
hits_particles = hits_particles.merge(
n_particles_per_hit,
on=["event_id", "hit_id"],
how="left",
)
return hits_particles, particles


def cut_long_tracks(
hits_particles: pd.DataFrame,
particles: pd.DataFrame,
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    # Get tracks with at least 7 hits
if "nhits_velo" not in hits_particles:
hits_particles = hits_particles.merge(
particles[["event_id", "particle_id", "nhits_velo"]],
how="left",
on=["event_id", "particle_id"],
)
mask_long = hits_particles["nhits_velo"] >= 7
hits_particles_long = hits_particles[mask_long]
hits_particles_short = hits_particles[~mask_long]
    # Cut long tracks down to 3, 4, 5 or 6 hits, with many more cut to 3 or 4 hits
mask_long_into_small = mask_long_into_small_tracks(
hits_particles=hits_particles_long,
track_size_proportions={
3: 0.50,
4: 0.30,
5: 0.10,
6: 0.02,
-1: 0.08, # 8 % untouched
},
)
hits_particles = pd.concat(
(hits_particles_short, hits_particles_long[mask_long_into_small])
)
return hits_particles, particles
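

# The intent of the proportions above: half of the long tracks are cut down to
# 3 hits, 30 % to 4 hits, 10 % to 5 hits, 2 % to 6 hits, and the ``-1`` bucket
# (8 %) is left untouched.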


def reconstructible_scifi(hits_particles: pd.DataFrame, particles: pd.DataFrame):
mask_particles_to_keep = particles["has_scifi"]
particles, hits_particles = apply_mask(
mask_particles_to_keep, particles, hits_particles
)
return hits_particles, particles


def at_least_7_planes(
hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
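    """Keep particles whose plane indices span at least 7 (``max - min >= 7``),
    which is not the same as hitting 7 distinct planes."""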
plane_groups = hits_particles.groupby(["event_id", "particle_id"])["plane"]
diff_planes = plane_groups.max() - plane_groups.min()
particles = particles.merge(
diff_planes.rename("plane_diff").reset_index(),
how="left",
on=["event_id", "particle_id"],
)
mask_particles_to_keep = particles["plane_diff"] >= 7
particles, hits_particles = apply_mask(
mask_particles_to_keep, particles, hits_particles
)
return hits_particles, particles


def at_least_1_hit_on_scifi(hits_particles: pd.DataFrame, particles: pd.DataFrame):
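    # Unlike the ``apply_mask``-based selections above, only ``particles`` is
    # filtered here; ``hits_particles`` is returned untouched.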
particles = particles[particles["nhits_scifi"] >= 1]
return hits_particles, particles


def less_than_3_hits_on_same_plane(
hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
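    """Drop particles that have 3 or more hits on any single plane."""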
particles = particles.merge(
hits_particles.groupby(["event_id", "particle_id", "plane"])["hit_id"]
.count()
.groupby(["event_id", "particle_id"])
.max()
.rename("max_n_hits_per_plane")
.reset_index(),
on=["event_id", "particle_id"],
how="left",
)
mask_particles_to_keep = particles["max_n_hits_per_plane"] <= 2
particles, hits_particles = apply_mask(
mask_particles_to_keep, particles, hits_particles
)
return hits_particles, particles


def remove_particles_too_scattered_on_plane(
hits_particles: pd.DataFrame, particles: pd.DataFrame, max_xdiffs: float = 2.5
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
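    """Remove particles that are too scattered in ``xatyeq0`` within a plane.

    Note: a particle is kept as soon as at least one of its planes has an
    ``xatyeq0`` spread (max - min) below ``max_xdiffs``.
    """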
particles_planes_groups = hits_particles.groupby(
["event_id", "particle_id", "plane"]
)
xmins = particles_planes_groups["xatyeq0"].min().rename("xmin")
xmaxs = particles_planes_groups["xatyeq0"].max().rename("xmax")
xdiffs = xmaxs - xmins
particles = particles.merge(
right=xdiffs[xdiffs < max_xdiffs]
.reset_index()[["event_id", "particle_id"]]
.drop_duplicates(),
how="inner",
on=["event_id", "particle_id"],
)
hits_particles = only_keep_hits_on_particles(
hits_particles=hits_particles, particles=particles
)
return hits_particles, particles


def remove_particle_not_poly_enough(
hits_particles: pd.DataFrame, particles: pd.DataFrame, max_distance: float = 70.0
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
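    """Drop particles whose ``distance_to_poly``, computed from their non-stereo
    hits (``dxdy == 0``) with a quadratic fit in the SciFi x-z plane, is not
    below ``max_distance``."""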
poly_metrics = compute_particle_line_metrics_dataframe(
hits_particles[hits_particles["dxdy"] == 0.0],
metric_names=["distance_to_poly", "quadratic_coeff"],
coord_names=get_coordinate_names("scifi_xz"),
line_type="quadpoly_2d",
)
particles = particles.merge(
poly_metrics.reset_index(),
how="left",
on=["event_id", "particle_id"],
)
particles["distance_to_poly"] = particles["distance_to_poly"].fillna(0.0)
particles["quadratic_coeff"] = particles["quadratic_coeff"].fillna(0.0)
particles, hits_particles = apply_mask(
particles_mask=particles["distance_to_poly"] < max_distance,
particles=particles,
hits_particles=hits_particles,
)
return hits_particles, particles