Source code for pipeline.utils.tools.tarray

"""A module that handles conversions between tensors, arrays and dataframes,
on CPU (numpy and pandas) or GPU (cupy and cudf).
"""
from __future__ import annotations
import typing
import numpy as np
import pandas as pd
import torch

try:
    import cupy as cp

except ImportError:
    cp = None

try:
    import cudf

except ImportError:
    cudf = None


# Type aliases used in the annotations of this module. When the optional GPU
# library is importable, the alias is a constrained ``TypeVar`` covering both
# the CPU and the GPU flavour; otherwise it falls back to the CPU-only form.
if cp is not None:
    Array = typing.TypeVar("Array", np.ndarray, cp.ndarray)
    TensorOrArray = typing.TypeVar(
        "TensorOrArray", torch.Tensor, np.ndarray, cp.ndarray
    )
else:
    Array = np.ndarray
    TensorOrArray = typing.TypeVar("TensorOrArray", torch.Tensor, np.ndarray)

# Same idea for series / dataframes: pandas only, or pandas + cudf.
if cudf is not None:
    Series = typing.TypeVar("Series", pd.Series, cudf.Series)
    DataFrame = typing.TypeVar("DataFrame", pd.DataFrame, cudf.DataFrame)
else:
    Series = pd.Series
    DataFrame = pd.DataFrame


def series_to_array(series: Series | DataFrame) -> Array:
    """Convert a pandas/cudf series or dataframe into a numpy/cupy array.

    Args:
        series: A pandas or cudf ``Series`` / ``DataFrame``.

    Returns:
        The result of ``to_numpy()`` for pandas objects, or of ``to_cupy()``
        for cudf objects.

    Raises:
        TypeError: If ``series`` is neither a pandas nor a cudf object
            (or is not a pandas object and cudf is not installed).
    """
    if isinstance(series, (pd.Series, pd.DataFrame)):
        return series.to_numpy()  # type: ignore

    # Anything that is not a pandas object must be a cudf object.
    if cudf is None or not isinstance(series, (cudf.Series, cudf.DataFrame)):
        raise TypeError()

    return series.to_cupy()
def array_to_tensor(array: Array, **kwargs) -> torch.Tensor:
    """Convert a numpy or cupy array into a torch tensor.

    Args:
        array: A numpy array (converted with ``torch.from_numpy``) or a cupy
            array (converted to a tensor on the ``"cuda"`` device).
        **kwargs: Extra keyword arguments forwarded to ``torch.zeros`` /
            ``torch.as_tensor`` for cupy arrays. They are not used for
            numpy arrays.

    Returns:
        The corresponding torch tensor.

    Raises:
        TypeError: If ``array`` is neither a numpy nor a cupy array
            (or is not a numpy array and cupy is not installed).
    """
    if isinstance(array, np.ndarray):
        return torch.from_numpy(array)

    if cp is None or not isinstance(array, cp.ndarray):
        raise TypeError()

    if array.size != 0:
        return torch.as_tensor(array, device="cuda", **kwargs)

    # Empty cupy arrays are handled separately: an empty tensor of the same
    # shape and dtype is allocated directly on the GPU.
    torch_dtype = getattr(torch, str(array.dtype))
    return torch.zeros(
        size=array.shape,
        dtype=torch_dtype,
        device="cuda",
        **kwargs,
    )
def series_to_tensor(series: Series | DataFrame) -> torch.Tensor:
    """Convert a pandas/cudf series or dataframe into a torch tensor.

    The input is first turned into a numpy/cupy array with
    :func:`series_to_array`, which is then converted with
    :func:`array_to_tensor`.
    """
    return array_to_tensor(series_to_array(series=series))
# Only defined when cupy is importable.
if cp is not None:

    def tensor_to_cupy_array(tensor: torch.Tensor | cp.ndarray) -> cp.ndarray:
        """Convert a tensor (or an existing array) into a cupy array.

        Notes:
            Handles the corner case of a boolean tensor: it is cast to float
            first and cupy converts it back to ``bool``.
        """
        if not isinstance(tensor, torch.Tensor):
            return cp.asarray(tensor)

        detached = tensor.detach()
        if detached.dtype == torch.bool:
            # NOTE(review): presumably direct conversion of bool tensors is
            # not supported by cupy -- confirm.
            return cp.asarray(detached.float(), dtype=bool)
        return cp.asarray(detached)
def tensor_to_array(tensor: torch.Tensor) -> Array:
    """Turn a tensor into an array living on the same kind of device.

    Args:
        tensor: A torch tensor on CPU or on a CUDA device.

    Returns:
        A cupy array if ``tensor`` is on a CUDA device, a numpy array
        otherwise.

    Raises:
        Exception: If ``tensor`` is on a CUDA device but ``cupy`` is not
            installed.
    """
    if tensor.device.type == "cuda":
        # GPU tensors cannot be converted with ``Tensor.numpy()``; they go
        # through cupy instead.
        if cp is None:
            raise Exception("Trying to use array on GPU but `cupy` is not installed.")
        return tensor_to_cupy_array(tensor=tensor)

    # ``Tensor.numpy()`` refuses tensors that require grad, hence ``detach``.
    return tensor.detach().numpy()
def _to_numpy_array(value: torch.Tensor | Array) -> np.ndarray:
    """Convert a CPU tensor or array-like into a numpy array."""
    if isinstance(value, torch.Tensor):
        # ``Tensor.numpy()`` refuses tensors that require grad.
        return value.detach().numpy()
    return np.asarray(value)


@typing.overload
def to_dataframe(
    tensors: typing.Dict[str, torch.Tensor | Array],
    use_cuda: typing.Literal[True],
    index: torch.Tensor | Array | None = None,
) -> cudf.DataFrame: ...


@typing.overload
def to_dataframe(
    tensors: typing.Dict[str, torch.Tensor | Array],
    use_cuda: typing.Literal[False],
    index: torch.Tensor | Array | None = None,
) -> pd.DataFrame: ...


@typing.overload
def to_dataframe(
    tensors: typing.Dict[str, torch.Tensor | Array],
    use_cuda: bool,
    index: torch.Tensor | Array | None = None,
) -> DataFrame: ...


def to_dataframe(
    tensors: typing.Dict[str, torch.Tensor | Array],
    use_cuda: bool,
    index: torch.Tensor | Array | None = None,
) -> DataFrame:
    """Convert a dictionary of tensors / arrays into a dataframe on CPU or GPU.

    Args:
        tensors: Mapping from column name to the tensor or array holding the
            values of that column.
        use_cuda: If ``True``, build a cudf dataframe; otherwise build a
            pandas dataframe.
        index: Optional tensor or array used as the dataframe index.

    Returns:
        A ``cudf.DataFrame`` if ``use_cuda`` is true, a ``pd.DataFrame``
        otherwise.

    Raises:
        Exception: If ``use_cuda`` is true but ``cudf`` is not installed.
    """
    if use_cuda:
        if cudf is None:
            # Fail loudly instead of silently returning ``None``.
            raise Exception(
                "Trying to use a dataframe on GPU but `cudf` is not installed."
            )
        # ``tensor_to_cupy_array`` detaches tensors itself.
        return cudf.DataFrame(
            {
                column: tensor_to_cupy_array(tensor=tensor)
                for column, tensor in tensors.items()
            },
            index=None if index is None else tensor_to_cupy_array(tensor=index),
        )  # type: ignore

    return pd.DataFrame(
        {column: _to_numpy_array(tensor) for column, tensor in tensors.items()},
        index=None if index is None else _to_numpy_array(index),
    )  # type: ignore
def get_numpy_or_cupy(use_cuda: bool):
    """Return the array module matching the requested device.

    Args:
        use_cuda: Whether arrays on GPU are wanted.

    Returns:
        The ``cupy`` module if ``use_cuda`` is true, the ``numpy`` module
        otherwise.

    Raises:
        Exception: If ``use_cuda`` is true but ``cupy`` is not installed.
    """
    if not use_cuda:
        return np
    if cp is None:
        raise Exception("Trying to use array on GPU but `cupy` is not installed.")
    return cp
def get_pandas_or_cudf(use_cuda: bool):
    """Return the dataframe module matching the requested device.

    Args:
        use_cuda: Whether dataframes on GPU are wanted.

    Returns:
        The ``cudf`` module if ``use_cuda`` is true, the ``pandas`` module
        otherwise.

    Raises:
        Exception: If ``use_cuda`` is true but ``cudf`` is not installed.
    """
    if not use_cuda:
        return pd
    if cudf is None:
        raise Exception(
            "Trying to use a dataframe on GPU but `cudf` is not installed."
        )
    return cudf
def get_use_cuda_from_dataframe(dataframe: DataFrame) -> bool:
    """Tell whether a dataframe is a GPU (cudf) dataframe.

    Args:
        dataframe: A pandas or cudf dataframe.

    Returns:
        ``False`` for a pandas dataframe, ``True`` for a cudf dataframe.

    Raises:
        TypeError: If ``dataframe`` is neither a pandas nor a cudf dataframe
            (or is not a pandas dataframe and cudf is not installed).
    """
    if isinstance(dataframe, pd.DataFrame):
        return False
    if cudf is None or not isinstance(dataframe, cudf.DataFrame):
        raise TypeError()
    return True
def count_occurences(tensor: TensorOrArray) -> TensorOrArray:
    """Count, for each element, how many times it appears in the input.

    Args:
        tensor: Torch tensor, numpy array or cupy array.

    Returns:
        For each element in ``tensor``, the number of times it appears in
        ``tensor``.

    Raises:
        TypeError: If ``tensor`` is not a torch tensor, a numpy array or a
            cupy array.
    """
    # Pick the ``unique`` implementation of the library owning the input.
    if isinstance(tensor, torch.Tensor):
        unique_fn = torch.unique
    elif isinstance(tensor, np.ndarray):
        unique_fn = np.unique
    elif cp is not None and isinstance(tensor, cp.ndarray):
        unique_fn = cp.unique  # type: ignore
    else:
        raise TypeError()

    _, positions, occurrences = unique_fn(
        tensor,
        return_counts=True,
        return_inverse=True,
    )
    # ``positions`` maps every input element to its unique value, so indexing
    # the per-unique counts with it yields a count per input element.
    return occurrences[positions]  # type: ignore