"""A module that allows to handle conversion between tensors, arrays and dataframes,
on CPU (numpy and pandas) or GPU (cupy and cudf).
"""
from __future__ import annotations
import typing
import numpy as np
import pandas as pd
import torch
try:
import cupy as cp
except ImportError:
cp = None
try:
import cudf
except ImportError:
cudf = None
# Type aliases used in the annotations of this module. The GPU alternatives
# (cupy arrays, cudf series/dataframes) are only part of an alias when the
# corresponding optional package could be imported.
if cp is None:
    Array = np.ndarray
    TensorOrArray = typing.TypeVar("TensorOrArray", torch.Tensor, np.ndarray)
else:
    Array = typing.TypeVar("Array", np.ndarray, cp.ndarray)
    TensorOrArray = typing.TypeVar(
        "TensorOrArray", torch.Tensor, np.ndarray, cp.ndarray
    )

if cudf is None:
    Series = pd.Series
    DataFrame = pd.DataFrame
else:
    Series = typing.TypeVar("Series", pd.Series, cudf.Series)
    DataFrame = typing.TypeVar("DataFrame", pd.DataFrame, cudf.DataFrame)
def series_to_array(series: Series | DataFrame) -> Array:
    """Convert a pandas/cudf series or dataframe into a numpy/cupy array.

    Args:
        series: pandas or cudf ``Series`` / ``DataFrame``.

    Returns:
        The result of ``to_numpy()`` for a pandas input, or of ``to_cupy()``
        for a cudf input.

    Raises:
        TypeError: if ``series`` is neither a pandas nor a cudf object.
    """
    pandas_types = (pd.Series, pd.DataFrame)
    if isinstance(series, pandas_types):
        return series.to_numpy()  # type: ignore

    # cudf is optional: without it, nothing but pandas objects is accepted.
    if cudf is None or not isinstance(series, (cudf.Series, cudf.DataFrame)):
        raise TypeError()
    return series.to_cupy()
def array_to_tensor(array: Array, **kwargs) -> torch.Tensor:
    """Turn a numpy or cupy array into a torch tensor.

    Args:
        array: numpy array (CPU) or cupy array (GPU).
        **kwargs: extra keyword arguments forwarded to the torch constructor
            (``torch.as_tensor``, or ``torch.zeros`` for an empty cupy
            array), e.g. ``dtype``. ``device`` must not be passed for a cupy
            array since it is already set to ``"cuda"``.

    Returns:
        A tensor with the content of ``array``. A cupy array yields a tensor
        on the ``"cuda"`` device.

    Raises:
        TypeError: if ``array`` is neither a numpy nor a cupy array.
    """
    if isinstance(array, np.ndarray):
        # Without keyword arguments this shares memory with ``array`` exactly
        # like ``torch.from_numpy``; unlike it, the keyword arguments are
        # honoured instead of being silently dropped.
        return torch.as_tensor(array, **kwargs)

    if cp is not None and isinstance(array, cp.ndarray):
        if array.size == 0:
            # Empty cupy arrays are not handed to ``torch.as_tensor``: an
            # empty tensor of the same shape is created instead. The dtype of
            # the array is only a default so that a caller-supplied ``dtype``
            # does not collide with it.
            kwargs.setdefault("dtype", getattr(torch, str(array.dtype)))
            return torch.zeros(size=array.shape, device="cuda", **kwargs)
        return torch.as_tensor(array, device="cuda", **kwargs)

    raise TypeError(
        f"Expected a numpy or cupy array, got `{type(array).__name__}`."
    )
def series_to_tensor(series: Series | DataFrame) -> torch.Tensor:
    """Convert a pandas/cudf series or dataframe into a torch tensor.

    The input first goes through ``series_to_array`` and the resulting
    numpy/cupy array is then handed to ``array_to_tensor``.
    """
    return array_to_tensor(series_to_array(series=series))
if cp is not None:

    def tensor_to_cupy_array(tensor: torch.Tensor | cp.ndarray) -> cp.ndarray:
        """Convert a tensor (or anything ``cp.asarray`` accepts) to a cupy array.

        Notes:
            Handles the corner case of a boolean tensor: it is first cast to
            float and the cupy array is then built with ``dtype=bool``.
        """
        if not isinstance(tensor, torch.Tensor):
            return cp.asarray(tensor)

        detached = tensor.detach()
        if detached.dtype == torch.bool:
            return cp.asarray(detached.float(), dtype=bool)
        return cp.asarray(detached)
def tensor_to_array(tensor: torch.Tensor) -> Array:
    """Turn a tensor into an array living on the same kind of device.

    Args:
        tensor: torch tensor, on CPU or on a CUDA device.

    Returns:
        A cupy array if ``tensor`` is on a CUDA device, a numpy array
        otherwise.

    Notes:
        The CUDA path relies on ``tensor_to_cupy_array``, which is only
        defined when ``cupy`` could be imported.
    """
    if tensor.device.type == "cuda":
        return tensor_to_cupy_array(tensor=tensor)
    # ``Tensor.numpy`` refuses tensors that require grad, hence the detach
    # (the cupy conversion detaches as well).
    return tensor.detach().numpy()
def _to_numpy_array(values: torch.Tensor | Array) -> np.ndarray:
    """Turn a tensor or an array-like into a numpy array on CPU."""
    if isinstance(values, torch.Tensor):
        # ``Tensor.numpy`` only accepts CPU tensors that do not require grad.
        return values.detach().cpu().numpy()
    return np.asarray(values)


@typing.overload
def to_dataframe(
    tensors: typing.Dict[str, torch.Tensor | Array],
    use_cuda: typing.Literal[True],
    index: torch.Tensor | Array | None = None,
) -> cudf.DataFrame:
    ...


@typing.overload
def to_dataframe(
    tensors: typing.Dict[str, torch.Tensor | Array],
    use_cuda: typing.Literal[False],
    index: torch.Tensor | Array | None = None,
) -> pd.DataFrame:
    ...


@typing.overload
def to_dataframe(
    tensors: typing.Dict[str, torch.Tensor | Array],
    use_cuda: bool,
    index: torch.Tensor | Array | None = None,
) -> DataFrame:
    ...


def to_dataframe(
    tensors: typing.Dict[str, torch.Tensor | Array],
    use_cuda: bool,
    index: torch.Tensor | Array | None = None,
) -> DataFrame:
    """Convert a dictionary of tensors / arrays into a dataframe on CPU or GPU.

    Args:
        tensors: maps each column name to the tensor or array holding the
            values of that column.
        use_cuda: whether to build a cudf dataframe (``True``) or a pandas
            dataframe (``False``).
        index: optional tensor or array used as the index of the dataframe.

    Returns:
        A ``cudf.DataFrame`` if ``use_cuda`` is true, a ``pd.DataFrame``
        otherwise.

    Raises:
        ImportError: if ``use_cuda`` is true but ``cudf`` is not installed.
    """
    if not use_cuda:
        return pd.DataFrame(
            {column: _to_numpy_array(values) for column, values in tensors.items()},
            index=None if index is None else _to_numpy_array(index),
        )  # type: ignore

    if cudf is None:
        # Fail loudly instead of implicitly returning ``None``.
        raise ImportError(
            "Trying to use a dataframe on GPU but `cudf` is not installed."
        )

    # ``tensor_to_cupy_array`` already detaches torch tensors.
    return cudf.DataFrame(
        {
            column: tensor_to_cupy_array(tensor=values)
            for column, values in tensors.items()
        },
        index=None if index is None else tensor_to_cupy_array(tensor=index),
    )  # type: ignore
def get_numpy_or_cupy(use_cuda: bool):
    """Return the array module matching the device in use.

    Args:
        use_cuda: whether arrays live on GPU.

    Returns:
        ``cupy`` if ``use_cuda`` is true, ``numpy`` otherwise.

    Raises:
        Exception: if ``use_cuda`` is true but ``cupy`` is not installed.
    """
    if not use_cuda:
        return np
    if cp is None:
        raise Exception("Trying to use array on GPU but `cupy` is not installed.")
    return cp
def get_pandas_or_cudf(use_cuda: bool):
    """Return the dataframe module matching the device in use.

    Args:
        use_cuda: whether dataframes live on GPU.

    Returns:
        ``cudf`` if ``use_cuda`` is true, ``pandas`` otherwise.

    Raises:
        Exception: if ``use_cuda`` is true but ``cudf`` is not installed.
    """
    if not use_cuda:
        return pd
    if cudf is None:
        raise Exception(
            "Trying to use a dataframe on GPU but `cudf` is not installed."
        )
    return cudf
def get_use_cuda_from_dataframe(dataframe: DataFrame) -> bool:
    """Tell whether ``dataframe`` is a GPU (cudf) dataframe.

    Returns:
        ``False`` for a pandas dataframe, ``True`` for a cudf dataframe.

    Raises:
        TypeError: if ``dataframe`` is neither of the two.
    """
    if isinstance(dataframe, pd.DataFrame):
        return False

    is_cudf_dataframe = cudf is not None and isinstance(dataframe, cudf.DataFrame)
    if not is_cudf_dataframe:
        raise TypeError()
    return True
def count_occurences(tensor: TensorOrArray) -> TensorOrArray:
    """Count, for every element, how many times it appears in the input.

    Args:
        tensor: torch tensor, numpy array or (if cupy is installed) cupy array.

    Returns:
        For each element in `tensor`, the number of times it appears
        in `tensor`.

    Raises:
        TypeError: if `tensor` is none of the supported types.
    """
    # Pick the ``unique`` implementation of the library the input belongs to;
    # the three of them are called with the same keyword arguments.
    if isinstance(tensor, torch.Tensor):
        unique = torch.unique
    elif isinstance(tensor, np.ndarray):
        unique = np.unique
    elif cp is not None and isinstance(tensor, cp.ndarray):
        unique = cp.unique
    else:
        raise TypeError()

    _, positions, occurences = unique(  # type: ignore
        tensor,
        return_inverse=True,
        return_counts=True,
    )
    # ``positions`` maps every input element to its unique value, so indexing
    # the per-unique-value counts with it yields one count per input element.
    return occurences[positions]  # type: ignore