# Source code for pipeline.Preprocessing.hit_filtering

"""A module that implements the ability of filter hits, grouping them by
particles.
"""
from __future__ import annotations
import typing
from math import isclose

import numpy as np
import numpy.typing as npt
import pandas as pd
import numba as nb

from utils.tools.tgroupby import get_group_indices


@nb.jit(nopython=True, cache=True)
def cut_long_tracks_impl(
    array_mask: npt.NDArray[np.bool_],
    event_ids: npt.NDArray[np.int_],
    particle_ids: npt.NDArray[np.int_],
    track_sizes: npt.NDArray[np.int_],
    proportions: npt.NDArray[np.float64],
    rng: np.random.Generator,
) -> None:
    """Cut long tracks to get smaller tracks. The first hits are removed.

    For every event, each particle is randomly assigned one of the
    ``track_sizes`` so that the sizes are distributed according to
    ``proportions``. The last ``track_size`` hits of the particle are then
    flagged ``True`` in ``array_mask``; a track that is shorter than the
    assigned size is kept entirely.

    ``array_mask`` is modified in place and entries are only ever set to
    ``True``, so the caller is expected to pass it filled with ``False``.

    Special track sizes:

    * ``-1``: all the hits of the particle are kept.
    * ``0``: the mask of the particle is left untouched.

    Note:
        The hits are assumed to be ordered so that all the hits of an event,
        and within an event all the hits of a particle, are contiguous, and so
        that the hits to drop come first within a particle -- TODO confirm
        against ``get_group_indices`` and the callers.

    Args:
        array_mask: the array of mask that indicates the hits that are kept
        event_ids: array of event IDs
        particle_ids: array of particle IDs
        track_sizes: array of track sizes
        proportions: array of proportion of track sizes, corresponding to
            ``track_sizes``
        rng: random generator for which track is cut to which size
    """
    indices_groupby_events = get_group_indices(event_ids)

    for event_start_idx, event_end_idx in zip(
        indices_groupby_events[:-1],
        indices_groupby_events[1:],
    ):
        # Slicing returns a view: writing to it updates `array_mask`.
        event_array_mask = array_mask[event_start_idx:event_end_idx]

        event_particle_ids = particle_ids[event_start_idx:event_end_idx]
        indices_groupby_particles = get_group_indices(event_particle_ids)

        # The group indices are consumed below as consecutive (start, end)
        # pairs, so there is one particle less than there are indices. Using
        # the number of hits of the event here would spread the track sizes
        # over too many slots and the proportions would not be respected.
        n_particles = max(indices_groupby_particles.shape[0] - 1, 0)

        # Array that contains for every particle how many hits it should have
        # given `proportions`
        event_required_n_hits = np.zeros(
            shape=n_particles,
            dtype=indices_groupby_particles.dtype,
        )

        # Slot boundaries are obtained by rounding the *cumulative* proportion.
        # Rounding each proportion on its own may leave slots at 0 (the track
        # would be dropped entirely) or ask for more slots than available.
        slot_start = 0
        cumulative_proportion = 0.0
        for track_size, proportion in zip(track_sizes, proportions):
            cumulative_proportion += proportion
            slot_end = min(round(n_particles * cumulative_proportion), n_particles)
            slot_end = max(slot_end, slot_start)

            event_required_n_hits[slot_start:slot_end] = track_size
            slot_start = slot_end

        # Shuffle track size
        rng.shuffle(event_required_n_hits)

        for particle_start_idx, particle_end_idx, particle_required_n_hits in zip(
            indices_groupby_particles[:-1],
            indices_groupby_particles[1:],
            event_required_n_hits,
        ):
            particle_array_mask = event_array_mask[particle_start_idx:particle_end_idx]
            # `-1` has to be tested before `!= 0`, otherwise it is handled as a
            # regular size and `mask[1:]` is kept instead of the whole track.
            if particle_required_n_hits == -1:  # keep all hits
                particle_array_mask[:] = True
            elif particle_required_n_hits != 0:
                particle_array_mask[-particle_required_n_hits:] = True


def mask_long_into_small_tracks(
    hits_particles: pd.DataFrame,
    track_size_proportions: typing.Dict[int, float],
    seed: int | None = None,
) -> pd.Series:
    """Create a mask to remove the first hits of long tracks to match the proportions
    of track sizes given as input.

    The hits are sorted by ``event_id``, ``particle_id`` and ``plane``, the
    mask is computed on the sorted hits by :py:func:`cut_long_tracks_impl`, and
    the result is put back in the order of ``hits_particles``.

    Args:
        hits_particles: dataframe of hits-particles. The columns ``event_id``,
            ``particle_id`` and ``plane`` are required. The mask is re-aligned
            with ``reindex``, which needs the index labels to be unique.
        track_size_proportions: dictionary that associates a track size with
            the expected proportion after the cut. The proportions must sum
            to 1.
        seed: Random seed for which track is cut to which size

    Returns:
        Boolean pandas series named ``hits_mask``, with the same index as
        ``hits_particles``, which indicates which hits are kept

    Raises:
        AssertionError: if the proportions do not sum to 1
    """
    track_sizes = np.array(list(track_size_proportions.keys()))
    proportions = np.array(list(track_size_proportions.values()))

    sum_props = sum(track_size_proportions.values())
    assert isclose(sum_props, 1.0), (
        "The sum of proportions in `track_size_proportions` "
        f"is equal to {sum_props} != 1.0"
    )

    # Group the hits of an event, then of a particle, together, ordered by
    # plane: the implementation drops the first hits of each group.
    sorted_hits_particles = hits_particles.sort_values(
        by=["event_id", "particle_id", "plane"]
    )

    # Nothing is kept by default; the implementation only sets entries to True.
    array_mask = np.zeros(shape=(sorted_hits_particles.shape[0]), dtype=bool)

    rng = np.random.default_rng(seed=seed)

    cut_long_tracks_impl(
        array_mask=array_mask,
        event_ids=sorted_hits_particles["event_id"].to_numpy(),
        particle_ids=sorted_hits_particles["particle_id"].to_numpy(),
        track_sizes=track_sizes,
        proportions=proportions,
        rng=rng,
    )

    # The mask follows the sorted order: attach the sorted index and go back to
    # the order of the input dataframe.
    return pd.Series(
        array_mask,
        index=sorted_hits_particles.index,
        name="hits_mask",
    ).reindex(hits_particles.index)