Source code for pipeline.Preprocessing.balancing

"""A module that implements a function to balance the dataset using weights.
"""
import typing
import numpy as np
import numpy.typing as npt


def compute_balancing_weights(
    array: npt.ArrayLike,
    nbins: typing.Optional[int] = None,
) -> npt.NDArray:
    """Compute per-sample weights that flatten the histogram of ``array``.

    Every value receives a weight inversely proportional to the population
    of the histogram bin it falls into, so that each non-empty bin carries
    the same total weight. The weights are then rescaled so that they sum
    to ``array.size``.

    Args:
        array: array of values of interest, which will be histogrammised
        nbins: number of bins in the histogram. When ``None``, the rounded
            square root of the number of values is used (at least one bin).

    Returns:
        Array of weights for every value in ``array`` (same shape as the
        input once converted with ``np.asarray``). An empty input yields an
        empty array.
    """
    array = np.asarray(array)

    if nbins is None:
        # Square-root rule. Clamp to one bin so that an empty input does not
        # request zero bins from np.histogram, and coerce to a plain int so
        # the bin count is an integer whatever round() returns for a NumPy
        # scalar.
        nbins = max(1, int(round(float(np.sqrt(array.size)))))

    # Histogrammise the variable
    counts, bin_edges = np.histogram(array, bins=nbins)

    # Weight of a bin is the inverse of its population. Empty bins produce
    # inf here, hence the silenced warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        bin_weights = 1 / counts

    # Associate every value with its bin. The right-most edge is dropped so
    # that values equal to the maximum land in the last bin rather than in
    # an overflow bin.
    bin_indices = np.digitize(array, bin_edges[:-1]) - 1
    # Anything that ended up below the first edge is assigned to bin 0
    bin_indices = np.maximum(bin_indices, 0)

    # Normalise so that the weights sum to the number of values
    weights = bin_weights[bin_indices]
    return array.size * weights / weights.sum()