Source code for pipeline.utils.commonutils.cfeatures

"""A module that defines common utilities for data-handling.
"""
from __future__ import annotations
import typing
import torch
from torch_geometric.data import Data

from utils.commonutils.config import load_config


def get_input_features(
    all_features: torch.Tensor, feature_indices: int | typing.List[int] | None
) -> torch.Tensor:
    """Extract the features that are trained on from the tensor of all features.

    Args:
        all_features: tensor of all the features. It is indexed as
            ``all_features[:, i]``, so the features are expected along the
            second dimension.
        feature_indices: if it is ``None``, ``all_features`` is returned as is.
            If it is an integer, corresponds to the number of leading features
            (columns) to include in the array of features.
            If it is a list of integers, it corresponds to the indices of the
            features (columns) to include from ``all_features``, in that order.

    Returns:
        Tensor of the selected features. For an integer or a list, the selected
        features lie along the second dimension, so that the number of columns
        matches the number of requested features.

    Raises:
        AssertionError: if ``feature_indices`` is an integer that is not
            strictly positive or exceeds the number of columns, or if the
            selected features contain NaN values.
        TypeError: if ``feature_indices`` is neither ``None``, an integer nor
            a list.
    """
    if feature_indices is None:
        input_features = all_features
    elif isinstance(feature_indices, int):
        assert feature_indices > 0, (
            "If `feature_indices` is an integer, it should be strictly " "positive. "
        )
        assert feature_indices <= all_features.shape[1], (
            f"`feature_indices` was set to {feature_indices}, but this "
            "number is larger than the number of columns in `all_features`."
        )
        input_features = all_features[:, :feature_indices]
    elif isinstance(feature_indices, list):
        # Index with the list directly so that the selected columns stay along
        # the second dimension. Concatenating the 1D columns along the last
        # axis would instead flatten them into a single long vector.
        input_features = all_features[:, feature_indices]
    else:
        raise TypeError(
            "The type of `feature_indices` is "
            f"{type(feature_indices).__name__}, which is not supported."
        )

    # TODO: remove after test
    # NaN is the only value that differs from itself, so this detects NaNs.
    assert not torch.any(input_features != input_features)
    return input_features


def get_number_input_features(feature_indices: int | typing.List[int]) -> int:
    """Count the input features described by ``feature_indices``.

    Args:
        feature_indices: either the number of features itself (integer), or
            the list of the indices of the features that are selected

    Returns:
        ``feature_indices`` itself if it is an integer, or the length of the
        list otherwise

    Raises:
        TypeError: if ``feature_indices`` is neither an integer nor a list
    """
    # A list holds one index per selected feature.
    if isinstance(feature_indices, list):
        return len(feature_indices)

    # An integer already is the number of features.
    if isinstance(feature_indices, int):
        return feature_indices

    type_name = type(feature_indices).__name__
    raise TypeError(
        "The type of `feature_indices` is " f"{type_name}, which is not supported."
    )


def get_unnormalised_features(
    batch: Data, path_or_config: str | dict, feature_names: typing.List[str]
) -> typing.List[torch.Tensor]:
    """Undo the normalisation of selected features of a PyTorch Geometric batch.

    The position, mean and scale of every feature are read from the
    ``processing`` section of the configuration (keys ``features``,
    ``feature_means`` and ``feature_scales``). Each requested column of
    ``batch.x`` is moved to the CPU and mapped back through
    ``value * scale + mean``.

    Args:
        batch: PyTorch Geometric data object whose ``x`` attribute holds
            the array of features, one feature per column
        path_or_config: configuration dictionary, or path to the YAML file that
            contains the configuration
        feature_names: names of the features whose unnormalised values are
            requested

    Returns:
        List of PyTorch tensors, one per entry of ``feature_names`` and in the
        same order, holding the unnormalised values of that feature
    """
    processing = load_config(path_or_config)["processing"]
    known_names = processing["features"]
    means = processing["feature_means"]
    scales = processing["feature_scales"]

    def unnormalise(name: str) -> torch.Tensor:
        # The position of the name in the configuration is also the column
        # index of the feature in ``batch.x``.
        column = known_names.index(name)
        offset, factor = means[column], scales[column]
        return batch.x[:, column].cpu() * factor + offset

    return [unnormalise(name) for name in feature_names]