Source code for pipeline.utils.graphutils.batch2df

"""A python module that allows to build dataframes directly from a batch PyTorch
data object.
"""

import typing
import torch
from torch_geometric.data import Data
from ..tools import tarray


def get_df_hits(batch: Data, hit_columns: typing.List[str]) -> tarray.DataFrame:
    """Get the dataframe of hits.

    Args:
        batch: PyTorch Data object that contains the hit tensors listed in
            ``hit_columns``.
        hit_columns: A list of hit columns to include in the output dataframe.

    Returns:
        Dataframe of hits with a ``hit_idx`` column and the columns
        ``hit_columns``.
    """
    use_cuda = batch[hit_columns[0]].device.type == "cuda"
    cp_or_np = tarray.get_numpy_or_cupy(use_cuda=use_cuda)
    df_hits = tarray.to_dataframe(
        {
            "hit_idx": cp_or_np.arange(batch[hit_columns[0]].shape[0]),
            **{hit_column: batch[hit_column] for hit_column in hit_columns},
        },
        use_cuda=use_cuda,
    )
    return df_hits
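A minimal usage sketch (illustrative only; the hit column names ``hit_x`` and
``hit_y`` below are hypothetical and not defined by this module):

>>> import torch
>>> from torch_geometric.data import Data
>>> batch = Data(hit_x=torch.rand(5), hit_y=torch.rand(5))
>>> df_hits = get_df_hits(batch, hit_columns=["hit_x", "hit_y"])
>>> # df_hits has the columns hit_idx, hit_x and hit_y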
def get_df_hits_particles_from_particle_id_hit_idx(
    particle_id_hit_idx: torch.Tensor,
) -> tarray.DataFrame:
    """Get the dataframe of hits-particles from a ``particle_id_hit_idx`` tensor.

    Args:
        particle_id_hit_idx: Tensor whose first column holds particle IDs and
            whose second column holds hit indices.

    Returns:
        Dataframe of hits-particles with columns ``particle_id`` and
        ``hit_idx``.
    """
    use_cuda = particle_id_hit_idx.device.type == "cuda"
    df_hits_particles = tarray.to_dataframe(
        {
            "particle_id": particle_id_hit_idx[:, 0],
            "hit_idx": particle_id_hit_idx[:, 1],
        },
        use_cuda=use_cuda,
    )
    return df_hits_particles
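An illustrative sketch of the expected tensor layout (the values are made up):
each row pairs a particle ID with a hit index.

>>> import torch
>>> particle_id_hit_idx = torch.tensor([[101, 0], [101, 1], [202, 2]])
>>> df = get_df_hits_particles_from_particle_id_hit_idx(particle_id_hit_idx)
>>> # one dataframe row per (particle_id, hit_idx) pair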
def get_df_hits_particles(
    batch: Data,
    particle_columns: typing.List[str] | None = None,
) -> tarray.DataFrame:
    """Get the dataframe of hits-particles.

    Args:
        batch: PyTorch Data object that contains the tensor
            ``particle_id_hit_idx``
        particle_columns: A list of particle columns to merge into the output
            dataframe. The particle column names are expected to be prefixed
            by ``particle_`` in ``batch``.

    Returns:
        Dataframe of hits-particles with columns ``particle_id``, ``hit_idx``
        and the columns ``particle_columns``.
    """
    df_hits_particles = get_df_hits_particles_from_particle_id_hit_idx(
        particle_id_hit_idx=batch["particle_id_hit_idx"]
    )
    if particle_columns is not None:
        df_particles = tarray.to_dataframe(
            {
                "particle_id": batch["unique_particle_id"],
                **{
                    particle_column: batch[f"particle_{particle_column}"]
                    for particle_column in particle_columns
                },
            },
            use_cuda=tarray.get_use_cuda_from_dataframe(df_hits_particles),
        )
        df_hits_particles = df_hits_particles.merge(
            df_particles,
            on=["particle_id"],
            how="left",
        )
    return df_hits_particles
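A hedged usage sketch; the particle column ``pt`` (stored as ``particle_pt``
on the batch) is hypothetical and the tensor values are made up:

>>> import torch
>>> from torch_geometric.data import Data
>>> batch = Data(
...     particle_id_hit_idx=torch.tensor([[101, 0], [101, 1], [202, 2]]),
...     unique_particle_id=torch.tensor([101, 202]),
...     particle_pt=torch.tensor([1.5, 0.7]),
... )
>>> df_hits_particles = get_df_hits_particles(batch, particle_columns=["pt"])
>>> # df_hits_particles has the columns particle_id, hit_idx and pt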
def merge_df_hits_particles_to_edges(
    df_edges: tarray.DataFrame,
    df_hits_particles: tarray.DataFrame,
    combine_particle_id: bool = False,
) -> tarray.DataFrame:
    """Merge the dataframe of hits-particles to the left and right hits of the
    dataframe of edges.

    Args:
        df_edges: Dataframe of edges, at least with columns ``hit_idx_left``,
            ``hit_idx_right``
        df_hits_particles: Dataframe of hits-particles, at least with columns
            ``hit_idx`` and ``particle_id``
        combine_particle_id: Whether to combine ``particle_id_left`` and
            ``particle_id_right`` into ``particle_id``

    Returns:
        Dataframe of edges-particles.
    """
    assert tarray.get_use_cuda_from_dataframe(
        df_edges
    ) == tarray.get_use_cuda_from_dataframe(df_hits_particles), (
        "Provided dataframes are not on the same device / host."
    )
    df_edges_particles = df_edges  # WARNING: not a copy
    for side in ["left", "right"]:
        df_edges_particles = df_edges_particles.merge(
            df_hits_particles.rename(
                columns={
                    column: f"{column}_{side}"
                    for column in df_hits_particles.columns
                }
            ),  # type: ignore
            on=f"hit_idx_{side}",
            how="left",
        )
    if combine_particle_id:
        # Combine particle_id_left and particle_id_right into a single
        # particle_id column, non-zero only when both sides match
        particle_left_eq_right_mask = (
            df_edges_particles["particle_id_left"]
            == df_edges_particles["particle_id_right"]
        )
        df_edges_particles["particle_id"] = 0
        df_edges_particles.loc[particle_left_eq_right_mask, "particle_id"] = (  # type: ignore
            df_edges_particles["particle_id_left"]
        )
        df_edges_particles.drop(
            ["particle_id_left", "particle_id_right"], axis=1, inplace=True
        )
    return df_edges_particles
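A sketch of how the two dataframes could be combined, assuming both were built
on the same device with the helpers above (indices and IDs are made up):

>>> import torch
>>> df_edges = get_df_edges_from_edge_index(
...     edge_index=torch.tensor([[0, 1, 2], [1, 2, 3]])
... )
>>> df_hits_particles = get_df_hits_particles_from_particle_id_hit_idx(
...     torch.tensor([[101, 0], [101, 1], [202, 2], [202, 3]])
... )
>>> df_edges_particles = merge_df_hits_particles_to_edges(
...     df_edges, df_hits_particles, combine_particle_id=True
... )
>>> # particle_id is 0 for edges whose two hits belong to different particles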
def get_df_edges(
    batch: Data,
    df_hits_particles: tarray.DataFrame | None = None,
    combine_particle_id: bool = False,
) -> tarray.DataFrame:
    """Get the dataframe of edges.

    Args:
        batch: PyTorch Data object that contains the tensors ``edge_index``
            and ``y``.
        df_hits_particles: Optional dataframe of hits-particles to merge to
            the left and right hits of the dataframe of edges
        combine_particle_id: Whether to combine ``particle_id_left`` and
            ``particle_id_right`` into ``particle_id`` when merging
            ``df_hits_particles``

    Returns:
        Dataframe of edges, with columns ``hit_idx_left``, ``hit_idx_right``,
        ``y``, ``edge_idx`` and, if ``df_hits_particles`` is provided, its
        columns suffixed by ``_left`` and ``_right``.
    """
    df_edges = get_df_edges_from_edge_index(
        edge_index=batch["edge_index"], tensors={"y": batch["y"]}
    )
    if df_hits_particles is not None:
        df_edges = merge_df_hits_particles_to_edges(
            df_edges=df_edges,
            df_hits_particles=df_hits_particles,
            combine_particle_id=combine_particle_id,
        )
    return df_edges  # type: ignore
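A minimal sketch, assuming the batch carries ``edge_index`` and a per-edge
target ``y`` (values made up):

>>> import torch
>>> from torch_geometric.data import Data
>>> batch = Data(
...     edge_index=torch.tensor([[0, 1, 2], [1, 2, 3]]),
...     y=torch.tensor([1, 0, 1]),
... )
>>> df_edges = get_df_edges(batch)
>>> # df_edges has the columns hit_idx_left, hit_idx_right, edge_idx and y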
def get_df_edges_from_edge_index(
    edge_index: torch.Tensor,
    tensors: typing.Dict[str, torch.Tensor] | None = None,
) -> tarray.DataFrame:
    """Get the dataframe of edges from an ``edge_index`` tensor.

    Args:
        edge_index: Tensor of shape ``(2, n_edges)`` that holds the left and
            right hit indices of every edge.
        tensors: Optional mapping from extra column names to edge-wise tensors
            to add to the dataframe.

    Returns:
        Dataframe of edges with columns ``hit_idx_left``, ``hit_idx_right``,
        ``edge_idx`` and the keys of ``tensors``.
    """
    use_cuda = edge_index.device.type == "cuda"
    cp_or_np = tarray.get_numpy_or_cupy(use_cuda=use_cuda)
    dict_edges = {
        "hit_idx_left": edge_index[0],
        "hit_idx_right": edge_index[1],
        "edge_idx": cp_or_np.arange(edge_index.shape[1]),
    }
    if tensors is not None:
        dict_edges.update(tensors)
    df_edges = tarray.to_dataframe(dict_edges, use_cuda=use_cuda)
    # assert not df_edges.duplicated(
    #     ["hit_idx_left", "hit_idx_right"]
    # ).any(), "The edges contain duplicates."
    return df_edges
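An illustrative sketch showing the optional ``tensors`` argument; the column
name ``score`` is hypothetical and the values are made up:

>>> import torch
>>> edge_index = torch.tensor([[0, 1], [1, 2]])
>>> df_edges = get_df_edges_from_edge_index(
...     edge_index, tensors={"score": torch.tensor([0.9, 0.1])}
... )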
def get_df_triplets_from_triplet_index(
    triplet_index: torch.Tensor,
) -> tarray.DataFrame:
    """Get the dataframe of triplets from a ``triplet_index`` tensor.

    Args:
        triplet_index: Tensor of shape ``(2, n_triplets)`` that holds, for
            every triplet, the indices of its two constituent edges.

    Returns:
        Dataframe of triplets with columns ``edge_idx_1``, ``edge_idx_2`` and
        ``triplet_idx``.
    """
    use_cuda = triplet_index.device.type == "cuda"
    cp_or_np = tarray.get_numpy_or_cupy(use_cuda=use_cuda)
    return tarray.to_dataframe(
        {
            "edge_idx_1": triplet_index[0],
            "edge_idx_2": triplet_index[1],
            "triplet_idx": cp_or_np.arange(triplet_index.shape[1]),
        },
        use_cuda=use_cuda,
    )
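A minimal sketch for the triplet helper; each column of ``triplet_index``
pairs two edge indices (values made up):

>>> import torch
>>> triplet_index = torch.tensor([[0, 1], [1, 2]])
>>> df_triplets = get_df_triplets_from_triplet_index(triplet_index)
>>> # df_triplets has the columns edge_idx_1, edge_idx_2 and triplet_idx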