Source code for pipeline.Preprocessing.run_preprocessing

"""This module defines how to run the pre-processing from a configuration file.
"""

from __future__ import annotations
import logging
import os.path as op
from copy import copy

from .preprocessing import preprocess
from utils.tools.tfiles import delete_directory
from utils.commonutils.config import load_config, resolve_relative_path, cdirs
from utils.commonutils.ctests import get_test_config_for_preprocessing


def run_preprocessing(
    path_or_config: str | dict,
    reproduce: bool = True,
    raise_enough_events: bool = True,
):
    """Run the pre-processing step.

    The ``preprocessing`` section of the configuration is shallow-copied before
    any default value is filled in, so that a configuration dictionary handed in
    by the caller is not modified by this function.

    The pre-processing is skipped (and a message is logged) when a ``done`` entry
    already exists inside the output directory.

    Args:
        path_or_config: configuration dictionary, or path to the YAML file that
            contains the configuration
        reproduce: whether to reproduce an existing preprocessing. When ``True``,
            ``delete_directory`` is called on the output directory before the
            ``done`` marker is checked.
        raise_enough_events: whether to raise an error if not any events were
            generated. Forwarded as-is to ``preprocess``.
    """
    config = load_config(path_or_config)

    # Copy first: every default filled in below goes into the copy only, never
    # into the dictionary the caller may still be holding.
    preprocessing_config = copy(config["preprocessing"])
    output_dir = preprocessing_config["output_dir"]

    # With no explicit number of events, fall back to the number of training
    # plus validation events requested in the ``processing`` section.
    if preprocessing_config["n_events"] is None:
        preprocessing_config["n_events"] = (
            config["processing"]["n_train_events"]
            + config["processing"]["n_val_events"]
        )

    if reproduce:
        delete_directory(output_dir)

    # Define parquet files to use: the detector-specific file names are only
    # used for keys the configuration does not already provide.
    detector = config["common"].get("detector", cdirs.detectors[0])
    filenames = cdirs.get_filenames_from_detector(detector=detector)
    for filename_key in ("hits_particles_filename", "particles_filename"):
        if filename_key not in preprocessing_config:
            preprocessing_config[filename_key] = filenames[filename_key]

    # The "done" entry in the output directory marks a completed run.
    if op.exists(op.join(output_dir, "done")):
        logging.info(
            f"Output directory {output_dir} exists and is not empty. "
            "Thus, the preprocessing was not run. "
            "Please use `reproduce=True` if you need to run the preprocessing again."
        )
        return

    preprocess(raise_enough_events=raise_enough_events, **preprocessing_config)


def run_preprocessing_test_dataset(
    test_dataset_name: str,
    path_or_config_test: str | dict | None = None,
    detector: str | None = None,
    reproduce: bool = False,
    raise_enough_events: bool = False,
):
    """Run the pre-processing of a test dataset.

    The configuration is built by ``get_test_config_for_preprocessing`` and then
    handed over to :func:`run_preprocessing`.

    Args:
        test_dataset_name: name of the test dataset to pre-process
        path_or_config_test: YAML test dataset configuration dictionary or path
            to it. Defaults to ``cdirs.test_config_path`` when ``None``.
        detector: detector to use. Defaults to the first entry of
            ``cdirs.detectors`` when ``None``.
        reproduce: whether to reproduce an existing preprocessing
        raise_enough_events: whether to raise an error if not any events were
            generated.
    """
    # Fall back to the project-wide defaults for anything left unspecified.
    chosen_detector = cdirs.detectors[0] if detector is None else detector
    test_config_source = (
        cdirs.test_config_path if path_or_config_test is None else path_or_config_test
    )

    preprocessing_test_config = get_test_config_for_preprocessing(
        test_dataset_name,
        path_or_config_test=test_config_source,
        detector=chosen_detector,
    )

    run_preprocessing(
        path_or_config=preprocessing_test_config,
        reproduce=reproduce,
        raise_enough_events=raise_enough_events,
    )