Source code for pipeline.Preprocessing.run_preprocessing

"""This module defines how to run the pre-processing from a configuration file.
"""

from __future__ import annotations
import logging
import os.path as op
from copy import copy

from .preprocessing import preprocess
from utils.tools.tfiles import delete_directory
from utils.commonutils.config import load_config, resolve_relative_path, cdirs
from utils.commonutils.ctests import get_test_config_for_preprocessing


def run_preprocessing(
    path_or_config: str | dict,
    reproduce: bool = True,
    raise_enough_events: bool = True,
):
    """Run the pre-processing step.

    The ``preprocessing`` section of the configuration is forwarded as keyword
    arguments to ``preprocess``, after filling in the entries that were left
    unspecified (number of events and input parquet file names).

    Args:
        path_or_config: configuration dictionary, or path to the YAML file that
            contains the configuration
        reproduce: whether to reproduce an existing preprocessing. If ``True``,
            the output directory is deleted before anything else is done.
        raise_enough_events: whether to raise an error if no events were
            generated. Forwarded unchanged to ``preprocess``.
    """
    config = load_config(path_or_config)
    preprocessing_section = config["preprocessing"]
    output_dir = preprocessing_section["output_dir"]

    # A missing ``n_events`` entry is handled like an explicit ``None``:
    # fall back to the total number of training and validation events.
    if preprocessing_section.get("n_events") is None:
        preprocessing_section["n_events"] = (
            config["processing"]["n_train_events"]
            + config["processing"]["n_val_events"]
        )

    if reproduce:
        delete_directory(output_dir)

    # Define parquet files to use
    detector = config["common"].get("detector", cdirs.detectors[0])
    # Shallow copy so that the file names added below do not end up in `config`
    preprocessing_config = copy(preprocessing_section)
    filenames = cdirs.get_filenames_from_detector(detector=detector)
    for filename_key in ("hits_particles_filename", "particles_filename"):
        if filename_key not in preprocessing_config:
            preprocessing_config[filename_key] = filenames[filename_key]

    # The "done" entry inside the output directory is what this function
    # checks to decide that a previous run already exists.
    if op.exists(op.join(output_dir, "done")):
        logging.info(
            "Output directory %s exists and is not empty. "
            "Thus, the preprocessing was not run. "
            "Please use `reproduce=True` if you need to run the preprocessing again.",
            output_dir,
        )
    else:
        preprocess(raise_enough_events=raise_enough_events, **preprocessing_config)


def run_preprocessing_test_dataset(
    test_dataset_name: str,
    path_or_config_test: str | dict | None = None,
    detector: str | None = None,
    reproduce: bool = False,
    raise_enough_events: bool = False,
):
    """Pre-process a test dataset.

    The configuration of the test dataset is built with
    ``get_test_config_for_preprocessing`` and then handed over to
    ``run_preprocessing``.

    Args:
        test_dataset_name: name of the test dataset to pre-process
        path_or_config_test: YAML test dataset configuration dictionary or path
            to it. If ``None``, ``cdirs.test_config_path`` is used.
        detector: detector passed to ``get_test_config_for_preprocessing``.
            If ``None``, ``cdirs.detectors[0]`` is used.
        reproduce: whether to reproduce an existing preprocessing
        raise_enough_events: whether to raise an error if no events were
            generated.
    """
    selected_detector = cdirs.detectors[0] if detector is None else detector
    test_config_source = (
        cdirs.test_config_path if path_or_config_test is None else path_or_config_test
    )

    test_config = get_test_config_for_preprocessing(
        test_dataset_name,
        path_or_config_test=test_config_source,
        detector=selected_detector,
    )

    run_preprocessing(
        path_or_config=test_config,
        reproduce=reproduce,
        raise_enough_events=raise_enough_events,
    )