Source code for pipeline.Preprocessing.process_custom

import typing
import logging
import numpy as np
import pandas as pd
from .particle_fitting_metrics import compute_particle_line_metrics_dataframe
from .hit_filtering import mask_long_into_small_tracks
from utils.commonutils.cdetector import get_coordinate_names


class SelectionFunction(typing.Protocol):
    """Signature shared by the selection functions of this module: take the
    hits-particles and particles dataframes and return both, filtered."""

    def __call__(
        self, hits_particles: pd.DataFrame, particles: pd.DataFrame
    ) -> typing.Tuple[pd.DataFrame, pd.DataFrame]: ...
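
# For illustration, a minimal sketch of a function conforming to the
# ``SelectionFunction`` protocol (``keep_even_particle_ids`` and its cut are
# hypothetical, not part of this pipeline):
#
#   def keep_even_particle_ids(
#       hits_particles: pd.DataFrame, particles: pd.DataFrame
#   ) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
#       mask = particles["particle_id"] % 2 == 0
#       particles, hits_particles = apply_mask(mask, particles, hits_particles)
#       return hits_particles, particles
#
#   selection: SelectionFunction = keep_even_particle_ids  # type-checks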


def apply_mask(
    particles_mask: pd.Series,
    particles: pd.DataFrame,
    hits_particles: pd.DataFrame,
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Apply a mask of particles to keep to both the dataframe of particles
    and the dataframe of hits-particles.

    Args:
        particles_mask: The mask to apply, that corresponds to the particles
            to keep in the dataframe ``particles``
        particles: Dataframe of particles
        hits_particles: Dataframe of hits-particles association

    Returns:
        Dataframes ``particles`` and ``hits_particles`` that only contain
        the particles and hits to keep.
    """
    # About 3 seconds on a dataframe with 5000 events
    hits_particles_mask = (
        hits_particles[["event_id", "particle_id"]]
        .reset_index()
        .merge(
            right=particles[["event_id", "particle_id"]].assign(mask_=particles_mask),
            on=["event_id", "particle_id"],
            how="left",
            sort=False,
        )
        .set_index("index")["mask_"]
        .fillna(True)  # fillna to keep fake hits
    )
    return particles[particles_mask], hits_particles[hits_particles_mask]
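
# Usage sketch for ``apply_mask`` on toy dataframes (the values below are
# hypothetical). Note that the fake hit with ``particle_id == 0`` survives:
# it matches no row of ``particles``, so ``fillna(True)`` keeps it.
#
#   particles = pd.DataFrame(
#       {"event_id": [0, 0], "particle_id": [1, 2], "pid": [11, 211]}
#   )
#   hits_particles = pd.DataFrame(
#       {"event_id": [0, 0, 0], "particle_id": [1, 2, 0], "hit_id": [10, 11, 12]}
#   )
#   particles_kept, hits_kept = apply_mask(
#       particles_mask=particles["pid"].abs() == 11,
#       particles=particles,
#       hits_particles=hits_particles,
#   )
#   # particles_kept contains particle 1 only;
#   # hits_kept contains hit 10 (particle 1) and hit 12 (fake hit).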


def only_keep_hits_on_particles(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> pd.DataFrame:
    """Only keep the hits associated with a particle present in ``particles``."""
    return hits_particles.merge(
        particles[["event_id", "particle_id"]],
        how="inner",
        on=["event_id", "particle_id"],
    )


def only_long_electrons(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Only keep long electrons.

    Args:
        hits_particles: Dataframe of hits-particles association
        particles: Dataframe of particles

    Returns:
        Dataframe of hits-particles association and particles, filtered so
        that only long electrons are left.
    """
    # 1. Create a mask of the particles to keep
    mask_particles_to_keep = (
        particles["has_velo"]
        & particles["has_scifi"]
        & (particles["pid"].abs() == 11)
    )
    # 2. Propagate the mask to the dataframes of particles and hits-particles
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )
    return hits_particles, particles


def everything_but_long_electrons(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Keep everything except long electrons.

    Args:
        hits_particles: Dataframe of hits-particles association
        particles: Dataframe of particles

    Returns:
        Dataframe of hits-particles association and particles, filtered so
        that long electrons are removed.
    """
    # 1. Create a mask of the particles to keep
    mask_particles_to_keep = particles["has_velo"] & ~(
        particles["has_scifi"] & (particles["pid"].abs() == 11)
    )
    # 2. Propagate the mask to the dataframes of particles and hits-particles
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )
    return hits_particles, particles


def default_old_training_for_rta_presentation(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Selection that was used in the training presented in the RTA meeting."""
    # Drop duplicated hits
    hits_particles = hits_particles.drop_duplicates(
        subset=["event_id", "particle_id", "plane"], keep="first"
    )
    # Remove fake hits (there shouldn't be any at this point)
    hits_particles = hits_particles[hits_particles["particle_id"] != 0]

    # Compute the distance to line and add it to the dataframe of particles
    logging.info("Compute distance to line (this might take some time)")
    new_distances = compute_particle_line_metrics_dataframe(
        hits=hits_particles,
        metric_names=["distance_to_line"],
    )
    particles = particles.merge(
        new_distances, how="left", on=["event_id", "particle_id"]
    )

    # Only keep reconstructible particles that are straight enough
    logging.info("Apply particle selection mask")
    mask_particles_to_keep = (
        (particles["has_velo"] == 1)
        & (particles["nhits_velo"] >= 3)
        & (particles["distance_to_line"] < np.sqrt(0.6))
    )
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )

    # Assert that there are no NaN values at this point
    assert not particles.isna().any().any()
    assert not hits_particles.isna().any().any()
    return hits_particles, particles


def everything_but_electrons(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Keep every VELO-reconstructible particle that is not an electron.

    Args:
        hits_particles: Dataframe of hits-particles association
        particles: Dataframe of particles

    Returns:
        Dataframe of hits-particles association and particles, filtered so
        that electrons are removed.
    """
    # 1. Create a mask of the particles to keep
    mask_particles_to_keep = particles["has_velo"] & (particles["pid"].abs() != 11)
    # 2. Propagate the mask to the dataframes of particles and hits-particles
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )
    return hits_particles, particles


def track_weighting_selection(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """The selection performed in the ``track-weighting`` experiment."""
    # Only keep reconstructible particles that are straight enough.
    # Also remove their hits to avoid split tracks
    # (we counterbalance by requiring enough clusters per event).
    logging.info("Compute distance to line (this might take some time)")
    new_distances = compute_particle_line_metrics_dataframe(
        hits=hits_particles, metric_names=["distance_to_line"]
    )
    particles = particles.merge(
        new_distances, how="left", on=["event_id", "particle_id"]
    )
    mask_particles_to_keep = particles["distance_to_line"] < 0.8
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )

    # Assert that there are no NaN values at this point
    assert not particles.isna().any().any()
    assert not hits_particles.isna().any().any()
    return hits_particles, particles


def triplets_first_selection(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """The selection performed in the ``triplets-edge`` experiment."""
    # Only keep one hit-particle association per hit (drop duplicates)
    hits_particles = hits_particles.drop_duplicates(
        subset=["event_id", "hit_id"], keep="first"
    )
    # Only keep reconstructible particles that are straight enough.
    # Also remove their hits to avoid split tracks
    # (we counterbalance by requiring enough clusters per event).
    logging.info("Compute distance to line (this might take some time)")
    new_distances = compute_particle_line_metrics_dataframe(
        hits=hits_particles, metric_names=["distance_to_line"]
    )
    particles = particles.merge(
        new_distances, how="left", on=["event_id", "particle_id"]
    )
    mask_particles_to_keep = particles["distance_to_line"] < 0.8
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )

    # Assert that there are no NaN values at this point
    assert not particles.isna().any().any()
    assert not hits_particles.isna().any().any()
    return hits_particles, particles


def remove_curved_particles(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Remove curved particles."""
    # Only keep reconstructible particles that are straight enough.
    # Also remove their hits to avoid split tracks
    # (we counterbalance by requiring enough clusters per event).
    logging.info("Compute distance to line (this might take some time)")
    new_distances = compute_particle_line_metrics_dataframe(
        hits=hits_particles, metric_names=["distance_to_line"]
    )
    particles = particles.merge(
        new_distances, how="left", on=["event_id", "particle_id"]
    )
    mask_particles_to_keep = particles["distance_to_line"] < 0.8
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )

    # Assert that there are no NaN values at this point
    assert not particles.isna().any().any()
    assert not hits_particles.isna().any().any()
    return hits_particles, particles


def compute_n_unique_planes(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Compute the number of unique planes for each particle."""
    # We train with all the hits and cut before the GNN
    n_unique_planes = (
        hits_particles.groupby(["event_id", "particle_id"])["plane"]
        .nunique()
        .rename("n_unique_planes")
    )
    particles = particles.merge(
        n_unique_planes, how="left", on=["event_id", "particle_id"]
    ).fillna(0)
    return hits_particles, particles
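
# Worked sketch of the plane counting above (toy values, illustrative only):
#
#   hits = pd.DataFrame(
#       {
#           "event_id": [0, 0, 0, 0],
#           "particle_id": [1, 1, 1, 1],
#           "plane": [0, 0, 1, 3],
#       }
#   )
#   hits.groupby(["event_id", "particle_id"])["plane"].nunique()
#   # -> 3: distinct planes are counted, not hits. Particles absent from
#   # ``hits_particles`` end up with n_unique_planes == 0 via ``fillna(0)``.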


def compute_n_particles_per_hit(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Compute the number of particles associated with each hit."""
    n_particles_per_hit = (
        hits_particles.groupby(["event_id", "hit_id"])["particle_id"]
        .count()
        .rename("n_particles_per_hit")
    )
    hits_particles = hits_particles.merge(
        n_particles_per_hit,
        on=["event_id", "hit_id"],
        how="left",
    )
    return hits_particles, particles


def cut_long_tracks(
    hits_particles: pd.DataFrame,
    particles: pd.DataFrame,
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Cut long tracks (at least 7 VELO hits) into shorter ones of 3 to 6
    hits, leaving a small fraction untouched."""
    # Get tracks with at least 7 hits
    if "nhits_velo" not in hits_particles:
        hits_particles = hits_particles.merge(
            particles[["event_id", "particle_id", "nhits_velo"]],
            how="left",
            on=["event_id", "particle_id"],
        )
    mask_long = hits_particles["nhits_velo"] >= 7
    hits_particles_long = hits_particles[mask_long]
    hits_particles_short = hits_particles[~mask_long]

    # Cut long tracks to 3, 4, 5 or 6 hits, with many more 3- and 4-hit tracks
    mask_long_into_small = mask_long_into_small_tracks(
        hits_particles=hits_particles_long,
        track_size_proportions={
            3: 0.50,
            4: 0.30,
            5: 0.10,
            6: 0.02,
            -1: 0.08,  # 8 % left untouched
        },
    )
    hits_particles = pd.concat(
        (hits_particles_short, hits_particles_long[mask_long_into_small])
    )
    return hits_particles, particles
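
# For intuition, a minimal sketch of what a mask like
# ``mask_long_into_small_tracks`` could compute (a hypothetical, simplified
# implementation, NOT the one in ``hit_filtering``): draw a target size per
# track from ``track_size_proportions``, then keep only the first ``size``
# hits of the track; -1 means "leave the track untouched".
#
#   rng = np.random.default_rng(0)
#
#   def sketch_mask(hits_particles, track_size_proportions):
#       sizes = list(track_size_proportions)
#       probs = list(track_size_proportions.values())
#
#       def per_track(group):
#           size = rng.choice(sizes, p=probs)
#           if size == -1:  # leave the track untouched
#               return pd.Series(True, index=group.index)
#           keep = pd.Series(False, index=group.index)
#           keep.iloc[:size] = True
#           return keep
#
#       return hits_particles.groupby(
#           ["event_id", "particle_id"], group_keys=False
#       ).apply(per_track)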


def reconstructible_scifi(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Only keep particles that are reconstructible in the SciFi."""
    mask_particles_to_keep = particles["has_scifi"]
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )
    return hits_particles, particles


def at_least_7_planes(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Only keep particles whose plane span (max plane - min plane) is at
    least 7."""
    plane_groups = hits_particles.groupby(["event_id", "particle_id"])["plane"]
    diff_planes = plane_groups.max() - plane_groups.min()
    particles = particles.merge(
        diff_planes.rename("plane_diff").reset_index(),
        how="left",
        on=["event_id", "particle_id"],
    )
    mask_particles_to_keep = particles["plane_diff"] >= 7
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )
    return hits_particles, particles
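
# Toy check of the plane-span cut above (illustrative values):
#
#   planes = pd.Series([2, 5, 9])  # planes hit by one particle
#   planes.max() - planes.min()    # -> 7, so the particle is kept
#
# A particle with hits on planes [2, 5, 8] has a span of 6 and is removed.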


def at_least_1_hit_on_scifi(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Only keep particles with at least one SciFi hit.

    Note that only ``particles`` is filtered; ``hits_particles`` is left
    untouched.
    """
    particles = particles[particles["nhits_scifi"] >= 1]
    return hits_particles, particles


def less_than_3_hits_on_same_plane(
    hits_particles: pd.DataFrame, particles: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Only keep particles that leave at most 2 hits on any single plane."""
    # Count the hits each particle leaves on each plane, then take the
    # maximum over planes for every particle
    particles = particles.merge(
        hits_particles.groupby(["event_id", "particle_id", "plane"])["hit_id"]
        .count()
        .groupby(["event_id", "particle_id"])
        .max()
        .rename("max_n_hits_per_plane")
        .reset_index(),
        on=["event_id", "particle_id"],
        how="left",
    )
    mask_particles_to_keep = particles["max_n_hits_per_plane"] <= 2
    particles, hits_particles = apply_mask(
        mask_particles_to_keep, particles, hits_particles
    )
    return hits_particles, particles
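
# Step-by-step sketch of the chained groupby above (toy values, illustrative
# only):
#
#   hits = pd.DataFrame(
#       {
#           "event_id": [0, 0, 0],
#           "particle_id": [1, 1, 1],
#           "plane": [0, 0, 1],
#           "hit_id": [10, 11, 12],
#       }
#   )
#   per_plane = hits.groupby(["event_id", "particle_id", "plane"])["hit_id"].count()
#   # plane 0 -> 2 hits, plane 1 -> 1 hit
#   per_plane.groupby(["event_id", "particle_id"]).max()
#   # -> 2, so max_n_hits_per_plane == 2 and the particle passes the <= 2 cut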


def remove_particles_too_scattered_on_plane(
    hits_particles: pd.DataFrame, particles: pd.DataFrame, max_xdiffs: float = 2.5
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Remove particles whose hits on a plane are spread over more than
    ``max_xdiffs`` in ``xatyeq0``.

    Note that, as written, a particle is kept as soon as at least one of its
    planes passes the cut.
    """
    particles_planes_groups = hits_particles.groupby(
        ["event_id", "particle_id", "plane"]
    )
    xmins = particles_planes_groups["xatyeq0"].min().rename("xmin")
    xmaxs = particles_planes_groups["xatyeq0"].max().rename("xmax")
    xdiffs = xmaxs - xmins
    particles = particles.merge(
        right=xdiffs[xdiffs < max_xdiffs]
        .reset_index()[["event_id", "particle_id"]]
        .drop_duplicates(),
        how="inner",
        on=["event_id", "particle_id"],
    )
    hits_particles = only_keep_hits_on_particles(
        hits_particles=hits_particles, particles=particles
    )
    return hits_particles, particles


def remove_particle_not_poly_enough(
    hits_particles: pd.DataFrame, particles: pd.DataFrame, max_distance: float = 70.0
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Remove particles whose SciFi hits deviate too much from a quadratic
    polynomial fitted in the x-z plane."""
    poly_metrics = compute_particle_line_metrics_dataframe(
        hits_particles[hits_particles["dxdy"] == 0.0],  # hits with no stereo slope
        metric_names=["distance_to_poly", "quadratic_coeff"],
        coord_names=get_coordinate_names("scifi_xz"),
        line_type="quadpoly_2d",
    )
    particles = particles.merge(
        poly_metrics.reset_index(),
        how="left",
        on=["event_id", "particle_id"],
    )
    particles["distance_to_poly"] = particles["distance_to_poly"].fillna(0.0)
    particles["quadratic_coeff"] = particles["quadratic_coeff"].fillna(0.0)
    particles, hits_particles = apply_mask(
        particles_mask=particles["distance_to_poly"] < max_distance,
        particles=particles,
        hits_particles=hits_particles,
    )
    return hits_particles, particles