Source code for pipeline.Preprocessing.particle_fitting_metrics

"""A module that implements the fitting of particle trajectories to straight
lines, in order to evaluate if the particle tracks are compatible with
straight lines.

This can be used for instance in order to only keep straight lines in the training
sample, in order to avoid training on weird tracks.
"""

import typing
import numpy as np
import pandas as pd
import numba as nb

from utils.tools.tgroupby import get_group_indices
from . import line_metrics, poly_metrics


@nb.jit(nopython=True, cache=True)
def compute_particle_line_metrics(
    particle_coords: np.ndarray,
    metric_names: typing.List[str],
    line_type: typing.Literal["line_3d", "quadpoly_2d"] = "line_3d",
) -> np.ndarray:
    n_hits = particle_coords.shape[0]
    if n_hits <= 1:
        return np.full(shape=len(metric_names), fill_value=np.nan)
    else:
        metric_values = np.zeros(shape=len(metric_names))
        if line_type == "line_3d":
            point, dir_vector = line_metrics.fit_line(particle_coords)

            for idx, metric_name in enumerate(metric_names):
                if metric_name == "distance_to_line":
                    metric_values[idx] = line_metrics.compute_distance_to_line(
                        particle_coords, point, dir_vector
                    )
                elif metric_name == "distance_to_z_axis":
                    metric_values[
                        idx
                    ] = line_metrics.compute_distance_to_z_axis(
                        particle_coords, point, dir_vector
                    )
                elif metric_name == "xz_angle":
                    metric_values[
                        idx
                    ] = line_metrics.compute_distance_to_z_axis(
                        particle_coords, point, dir_vector
                    )
                elif metric_name == "yz_angle":
                    metric_values[
                        idx
                    ] = line_metrics.compute_distance_to_z_axis(
                        particle_coords, point, dir_vector
                    )
                else:
                    print("Metric name:", metric_name)
                    raise ValueError("Metric name is not recognised.")

        elif line_type == "quadpoly_2d":
            if n_hits >= 3:
                coeffs = poly_metrics.fit_poly(
                    x=particle_coords[:, 0], y=particle_coords[:, 1], deg=2
                )
                for idx, metric_name in enumerate(metric_names):
                    if metric_name == "distance_to_poly":
                        metric_values[
                            idx
                        ] = poly_metrics.compute_distance_to_poly(
                            particle_coords, coeffs
                        )
                    elif metric_name == "quadratic_coeff":
                        metric_values[
                            idx
                        ] = poly_metrics.compute_quadratic_coeff(
                            particle_coords, coeffs
                        )
                    else:
                        print("Metric name:", metric_name)
                        raise ValueError("Metric name is not recognised.")
        else:
            raise ValueError("`Line type is not recognised.")

        return metric_values


@nb.jit(nopython=True, cache=True)
def compute_particle_metric(
    coords_particles: np.ndarray,
    particle_ids: np.ndarray,
    array_metric_values: np.ndarray,
    metric_names: typing.List[str],
    line_type: typing.Literal["line_3d", "quadpoly_2d"] = "line_3d",
    indices_groupby_particles_: typing.Optional[np.ndarray] = None,
):
    """Compute the distances between the particle hits, and a straight line fitted
    to these hit coordinates. Fill the pre-allocated array of distances ``distances``.

    Args:
        coords_particles: array of hit cartesian coordinates, sorted by particle IDs.
        particle_ids: Sorted array of particle IDs for every hits, whose coordinates
            are given in ``coords_particles``. There are as many elements in
            ``particle_ids`` as they are hits in ``coords_particles``
        array_metric_values: Empty array of the metric values to compute,
            for every unique particle ID in ``particle_ids``.
            This way, the memory space is pre-allocated.
            The array as many column as they are metrics to compute.
        metric_names: List of the names of the metrics to compute.
        indices_groupby_particles_: Array that contains the starting indices of every
            group of values in ``particle_ids``. If not given, it is computed for you
            anyway

    Notes:
        This function is basically called for every event.
    """
    if indices_groupby_particles_ is None:
        indices_groupby_particles = get_group_indices(particle_ids)
    else:
        indices_groupby_particles = indices_groupby_particles_

    for particle_idx, (start_idx, end_idx) in enumerate(
        zip(
            indices_groupby_particles[:-1],
            indices_groupby_particles[1:],
        )
    ):
        coords = coords_particles[start_idx:end_idx, :]
        array_metric_values[particle_idx, :] = compute_particle_line_metrics(
            particle_coords=coords,
            metric_names=metric_names,
            line_type=line_type,
        )


@nb.jit(nopython=True, cache=True)
def compute_particle_line_metrics_events_impl(
    array_metric_values: np.ndarray,
    coords_events_particles: np.ndarray,
    event_ids: np.ndarray,
    particle_ids: np.ndarray,
    metric_names: typing.List[str],
    line_type: typing.Literal["line_3d", "quadpoly_2d"] = "line_3d",
):
    """Compute the distances between the particle hits, and a straight line fitted
    to these hit coordinates, for all the events. Fill the pre-allocated array of
    distances ``distances``.

    Args:
        array_metric_values: Empty array of the metric values to compute,
            for every unique particle ID in ``particle_ids``.
            This way, the memory space is pre-allocated.
            The array as many column as they are metrics to compute.
        coords_events_particles: Cartesian coordinates of the hits, sorted by event IDs
            and particle IDs
        event_ids: sorted array of event IDs for every hit coordinates in
            ``coords_events_particles``
        particle_ids: sorted array of particle IDs group by event ID (in ``event_ids``),
            for every hit coordinates in ``coords_events_particles``
        metric_names: List of the names of the metrics to compute.
    """
    indices_groupby_events = get_group_indices(event_ids)
    event_idx = 0
    for start_idx, end_idx in zip(
        indices_groupby_events[:-1],
        indices_groupby_events[1:],
    ):
        particle_ids_event = particle_ids[start_idx:end_idx]
        indices_groupby_particles = get_group_indices(particle_ids_event)
        n_particles = indices_groupby_particles.shape[0] - 1

        compute_particle_metric(
            coords_particles=coords_events_particles[start_idx:end_idx],
            particle_ids=particle_ids_event,
            array_metric_values=array_metric_values[
                event_idx : event_idx + n_particles, :
            ],
            indices_groupby_particles_=indices_groupby_particles,
            metric_names=metric_names,
            line_type=line_type,
        )
        event_idx += n_particles


[docs]def compute_particle_line_metrics_dataframe(
    hits: pd.DataFrame,
    metric_names: typing.List[str],
    coord_names: typing.List[str] = ["x", "y", "z"],
    line_type: typing.Literal["line_3d", "quadpoly_2d"] = "line_3d",
    event_id_column: str = "event_id",
) -> pd.DataFrame:
    """Compute the pandas Series of the distance from particle hits to straight lines
    fitted to these lines. The "distance" actually corresponds to the square-root
    of the average of the squared distance between every hit and the straight line.

    Args:
        hits: Pandas DataFrame of hits-particles associations, with columns
            ``event``, ``particle_id`` and the cartesian coordinates ``x``, ``y``
            and ``z``.
        metric_names: List of the metric names to compute, which can be

            * ``distance_to_line``
            * ``distance_to_z_axis``
            * ``xz_angle``
            * ``yz_angle``

        linetype: str,
        event_id_column: name of the event ID column

    Returns:
        A pandas Series with index ``event`` and ``particle_id``, and for every
        particle that has hits, the metrics specified in ``metric_names``
    """
    if line_type == "line_3d":
        assert len(coord_names) == 3
    elif line_type == "quadpoly_2d":
        assert len(coord_names) == 2
    else:
        raise ValueError(
            "`line_type` should either be `line_3d` or `quadpoly_2d` but is "
            f"{line_type}"
        )

    hits = hits.sort_values(by=[event_id_column, "particle_id"])
    events_particles_group = hits.groupby(
        [event_id_column, "particle_id"], sort=False
    ).size()
    n_particles = events_particles_group.shape[0]
    array_metric_values = np.zeros(shape=(n_particles, len(metric_names)))
    compute_particle_line_metrics_events_impl(
        event_ids=hits[event_id_column].to_numpy(),
        particle_ids=hits["particle_id"].to_numpy(),
        coords_events_particles=hits[coord_names].to_numpy(),
        array_metric_values=array_metric_values,
        metric_names=metric_names,
        line_type=line_type,
    )
    return pd.DataFrame(
        array_metric_values,
        index=events_particles_group.index,
        columns=metric_names,
    )