Source code for pipeline.Preprocessing.balancing
"""A module that implements a function to balance a dataset using weights.
"""
import typing
import numpy as np
import numpy.typing as npt
def compute_balancing_weights(
    array: npt.ArrayLike,
    nbins: typing.Optional[int] = None,
) -> npt.NDArray:
    """Compute balancing weights so that histogram bins in ``array`` are same-sized with
    the weights.

    Every value receives a weight inversely proportional to the population of
    the histogram bin it falls into, so that the weighted histogram is flat
    over the populated bins. The weights are normalised so that they sum to
    ``array.size``.

    Args:
        array: array of values of interest, which will be histogrammised
        nbins: number of bins in the histogram. If ``None``, the rounded
            square root of the number of values is used.

    Returns:
        Array of weights for every value in ``array``, with the same shape as
        ``array``. An empty ``array`` yields an empty array of weights.
    """
    array = np.asarray(array)

    # Nothing to balance. Without this guard the default bin count below would
    # be 0, which ``np.histogram`` rejects with a ``ValueError``.
    if array.size == 0:
        return np.empty(array.shape, dtype=float)

    if nbins is None:
        # ``int`` guarantees a true integer bin count regardless of what
        # ``round`` returns for a NumPy floating-point scalar.
        nbins = int(round(np.sqrt(array.size)))

    # Histogrammise variable
    counts, bin_edges = np.histogram(array, bins=nbins)

    # Compute weights for each bin. Empty bins produce ``inf`` here, which is
    # harmless: no value of ``array`` falls into them, so they are never
    # looked up below.
    with np.errstate(divide="ignore", invalid="ignore"):
        bin_weights = 1 / counts

    # Associate a value in `array` with a bin. Only the left edges are used so
    # that values equal to the rightmost edge land in the last bin, as they do
    # in ``np.histogram``.
    bin_indices = np.digitize(array, bin_edges[:-1]) - 1
    # Guard against a value ending up left of the first edge (index -1).
    bin_indices = np.maximum(bin_indices, 0)

    # Return normalised weights
    weights = bin_weights[bin_indices]
    return array.size * weights / weights.sum()