Source code for pipeline.utils.tools.tarray

"""A module that allows to handle conversion between tensors, arrays and dataframes,
on CPU (numpy and pandas) or GPU (cupy and cudf).
"""
from __future__ import annotations
import typing
import numpy as np
import pandas as pd
import torch

try:
    import cupy as cp

except ImportError:
    cp = None

try:
    import cudf

except ImportError:
    cudf = None


# Array-like type: numpy only, or constrained to numpy/cupy when cupy imported.
if cp is None:
    Array = np.ndarray
else:
    Array = typing.TypeVar("Array", np.ndarray, cp.ndarray)

# Torch tensor or array; the cupy constraint is only added when cupy imported.
if cp is None:
    TensorOrArray = typing.TypeVar("TensorOrArray", torch.Tensor, np.ndarray)
else:
    TensorOrArray = typing.TypeVar(
        "TensorOrArray", torch.Tensor, np.ndarray, cp.ndarray
    )

# Series type: pandas only, or constrained to pandas/cudf when cudf imported.
if cudf is None:
    Series = pd.Series
else:
    Series = typing.TypeVar("Series", pd.Series, cudf.Series)

# Dataframe type: pandas only, or constrained to pandas/cudf when cudf imported.
if cudf is None:
    DataFrame = pd.DataFrame
else:
    DataFrame = typing.TypeVar("DataFrame", pd.DataFrame, cudf.DataFrame)


def series_to_array(series: Series | DataFrame) -> Array:
    """Convert a pandas/cudf series or dataframe into a numpy/cupy array.

    Args:
        series: A pandas or cudf ``Series`` or ``DataFrame``.

    Returns:
        The result of ``to_numpy()`` for pandas inputs, or of ``to_cupy()``
        for cudf inputs.

    Raises:
        TypeError: If ``series`` is neither a pandas object nor (when cudf is
            installed) a cudf object.
    """
    if isinstance(series, (pd.Series, pd.DataFrame)):
        return series.to_numpy()  # type: ignore

    # cudf types can only be checked when the optional import succeeded.
    is_cudf_object = cudf is not None and isinstance(
        series, (cudf.Series, cudf.DataFrame)
    )
    if not is_cudf_object:
        raise TypeError()
    return series.to_cupy()


def array_to_tensor(array: Array, **kwargs) -> torch.Tensor:
    """Turn a numpy or cupy array into a torch tensor.

    Args:
        array: A numpy array (converted on CPU) or a cupy array (converted on
            the ``"cuda"`` device).
        **kwargs: Extra keyword arguments forwarded to the torch constructor
            (typically ``dtype``). They are honoured for numpy inputs as well
            as cupy inputs.

    Returns:
        The tensor built from ``array``.

    Raises:
        TypeError: If ``array`` is neither a numpy array nor (when cupy is
            installed) a cupy array.
    """
    if isinstance(array, np.ndarray):
        # ``torch.as_tensor`` accepts the forwarded kwargs, which
        # ``torch.from_numpy`` cannot; without kwargs it still avoids a copy
        # for numpy inputs.
        return torch.as_tensor(array, **kwargs)

    if cp is not None and isinstance(array, cp.ndarray):
        if array.size == 0:
            # Empty cupy arrays are rebuilt with ``torch.zeros`` instead of
            # being handed to ``torch.as_tensor``.
            # NOTE(review): presumably the direct conversion fails on
            # zero-size arrays -- confirm.
            # A caller-provided ``dtype`` overrides the one derived from the
            # array instead of clashing with it as a duplicate keyword.
            zeros_kwargs = {"dtype": getattr(torch, str(array.dtype)), **kwargs}
            return torch.zeros(size=array.shape, device="cuda", **zeros_kwargs)
        return torch.as_tensor(array, device="cuda", **kwargs)

    raise TypeError(
        f"Expected a numpy or cupy array, got {type(array).__name__}."
    )


def series_to_tensor(series: Series | DataFrame) -> torch.Tensor:
    """Convert a pandas/cudf series or dataframe into a torch tensor.

    The input is first converted with ``series_to_array`` and the resulting
    array is then passed to ``array_to_tensor``.

    Args:
        series: A pandas or cudf ``Series`` or ``DataFrame``.

    Returns:
        The tensor built from the values of ``series``.
    """
    return array_to_tensor(series_to_array(series=series))


if cp is not None:

    def tensor_to_cupy_array(tensor: torch.Tensor | cp.ndarray) -> cp.ndarray:
        """Convert a torch tensor (or an existing array) into a cupy array.

        This function is only defined when ``cupy`` could be imported.

        Args:
            tensor: A torch tensor, or an object ``cp.asarray`` accepts
                directly.

        Returns:
            The cupy array produced by ``cp.asarray``.

        Notes:
            Handles the corner case of a boolean tensor: it is cast to float
            before the conversion, and the result is requested with a
            ``bool`` dtype.
        """
        if not isinstance(tensor, torch.Tensor):
            return cp.asarray(tensor)

        detached = tensor.detach()
        if detached.dtype == torch.bool:
            # NOTE(review): presumably cupy cannot ingest a bool tensor
            # directly, hence the float round-trip -- confirm.
            return cp.asarray(detached.float(), dtype=bool)
        return cp.asarray(detached)


def tensor_to_array(tensor: torch.Tensor) -> Array:
    """Turn a tensor into an array on the device the tensor lives on.

    Args:
        tensor: The torch tensor to convert.

    Returns:
        A cupy array when ``tensor`` is on a ``"cuda"`` device, a numpy array
        otherwise.

    Raises:
        Exception: If ``tensor`` is on a ``"cuda"`` device but ``cupy`` is not
            installed.
    """
    if tensor.device.type == "cuda":
        if cp is None:
            # ``tensor_to_cupy_array`` only exists when cupy was imported.
            raise Exception("Trying to use array on GPU but `cupy` is not installed.")
        return tensor_to_cupy_array(tensor=tensor)

    # Detach first: ``Tensor.numpy()`` refuses tensors that require grad.
    return tensor.detach().numpy()


def _tensor_or_array_to_numpy(data: torch.Tensor | Array) -> np.ndarray:
    """Convert a torch tensor or an array-like object into a numpy array."""
    if isinstance(data, torch.Tensor):
        # Detach first: ``Tensor.numpy()`` refuses tensors that require grad.
        return data.detach().numpy()
    return np.asarray(data)


@typing.overload
def to_dataframe(
    tensors: typing.Dict[str, torch.Tensor | Array],
    use_cuda: typing.Literal[True],
    index: torch.Tensor | Array | None = None,
) -> cudf.DataFrame:
    ...


@typing.overload
def to_dataframe(
    tensors: typing.Dict[str, torch.Tensor | Array],
    use_cuda: typing.Literal[False],
    index: torch.Tensor | Array | None = None,
) -> pd.DataFrame:
    ...


@typing.overload
def to_dataframe(
    tensors: typing.Dict[str, torch.Tensor | Array],
    use_cuda: bool,
    index: torch.Tensor | Array | None = None,
) -> DataFrame:
    ...


def to_dataframe(
    tensors: typing.Dict[str, torch.Tensor | Array],
    use_cuda: bool,
    index: torch.Tensor | Array | None = None,
) -> DataFrame:
    """Convert a dictionary of tensors / arrays into a dataframe on CPU or GPU.

    Args:
        tensors: Mapping from column name to the tensor or array holding the
            values of that column.
        use_cuda: If true, build a ``cudf.DataFrame`` from cupy arrays;
            otherwise build a ``pandas.DataFrame`` from numpy arrays.
        index: Optional tensor or array used as the dataframe index.

    Returns:
        A cudf dataframe when ``use_cuda`` is true, a pandas dataframe
        otherwise.

    Raises:
        Exception: If ``use_cuda`` is true but ``cudf`` is not installed.
    """
    if use_cuda:
        if cudf is None:
            raise Exception(
                "Trying to use a dataframe on GPU but `cudf` is not installed."
            )
        # ``tensor_to_cupy_array`` already detaches torch tensors.
        return cudf.DataFrame(
            {
                column: tensor_to_cupy_array(tensor=tensor)
                for column, tensor in tensors.items()
            },
            index=None if index is None else tensor_to_cupy_array(tensor=index),
        )  # type: ignore

    return pd.DataFrame(
        {
            column: _tensor_or_array_to_numpy(tensor)
            for column, tensor in tensors.items()
        },
        index=None if index is None else _tensor_or_array_to_numpy(index),
    )  # type: ignore


def get_numpy_or_cupy(use_cuda: bool):
    """Return the array module matching the requested device.

    Args:
        use_cuda: Whether arrays live on GPU.

    Returns:
        The ``cupy`` module when ``use_cuda`` is true, the ``numpy`` module
        otherwise.

    Raises:
        Exception: If ``use_cuda`` is true but ``cupy`` is not installed.
    """
    if not use_cuda:
        return np
    if cp is None:
        raise Exception("Trying to use array on GPU but `cupy` is not installed.")
    return cp


def get_pandas_or_cudf(use_cuda: bool):
    """Return the dataframe module matching the requested device.

    Args:
        use_cuda: Whether dataframes live on GPU.

    Returns:
        The ``cudf`` module when ``use_cuda`` is true, the ``pandas`` module
        otherwise.

    Raises:
        Exception: If ``use_cuda`` is true but ``cudf`` is not installed.
    """
    if not use_cuda:
        return pd
    if cudf is None:
        raise Exception(
            "Trying to use a dataframe on GPU but `cudf` is not installed."
        )
    return cudf


def get_use_cuda_from_dataframe(dataframe: DataFrame) -> bool:
    """Tell whether a dataframe is a GPU (cudf) dataframe.

    Args:
        dataframe: A pandas or cudf dataframe.

    Returns:
        ``False`` for a pandas dataframe, ``True`` for a cudf dataframe.

    Raises:
        TypeError: If ``dataframe`` is neither a pandas dataframe nor (when
            cudf is installed) a cudf dataframe.
    """
    if isinstance(dataframe, pd.DataFrame):
        return False

    # cudf types can only be checked when the optional import succeeded.
    is_cudf_frame = cudf is not None and isinstance(dataframe, cudf.DataFrame)
    if not is_cudf_frame:
        raise TypeError()
    return True


def count_occurences(tensor: TensorOrArray) -> TensorOrArray:
    """Count, for every element, how often its value occurs in the input.

    Args:
        tensor: Torch tensor, numpy array or (when cupy is installed) cupy
            array.

    Returns:
        For each element in `tensor`, number of times it appears
        in this tensor.

    Raises:
        TypeError: If ``tensor`` is none of the supported types.
    """
    # Select the ``unique`` implementation of the backend owning the input.
    if isinstance(tensor, torch.Tensor):
        unique_fn = torch.unique
    elif isinstance(tensor, np.ndarray):
        unique_fn = np.unique
    elif cp is not None and isinstance(tensor, cp.ndarray):
        unique_fn = cp.unique
    else:
        raise TypeError()

    _, inverse, per_value_counts = unique_fn(  # type: ignore
        tensor,
        return_inverse=True,
        return_counts=True,
    )
    # ``inverse`` maps each element to its unique value, so indexing the
    # per-value counts with it yields one count per input element.
    return per_value_counts[inverse]  # type: ignore