Source code for pipeline.Preprocessing.run_preprocessing
"""This module defines how to run the pre-processing from a configuration file.
"""
from __future__ import annotations
import logging
import os.path as op
from copy import copy
from .preprocessing import preprocess
from utils.tools.tfiles import delete_directory
from utils.commonutils.config import load_config, resolve_relative_path, cdirs
from utils.commonutils.ctests import get_test_config_for_preprocessing
def run_preprocessing(
    path_or_config: str | dict,
    reproduce: bool = True,
    raise_enough_events: bool = True,
):
    """Run the pre-processing step.

    The ``preprocessing`` section of the configuration is completed with default
    values (number of events, input file names) and forwarded as keyword
    arguments to ``preprocess``. Nothing is run if the output directory already
    contains a ``done`` entry.

    Args:
        path_or_config: configuration dictionary, or path to the YAML file that
            contains the configuration
        reproduce: whether to reproduce an existing preprocessing. If ``True``,
            the output directory is deleted before checking whether the
            preprocessing was already run.
        raise_enough_events: whether to raise an error if no events were
            generated. Forwarded unchanged to ``preprocess``.
    """
    config = load_config(path_or_config)

    # Work on a shallow copy of the section so that the defaults filled in below
    # never leak into the loaded (possibly caller-owned) configuration.
    preprocessing_config = copy(config["preprocessing"])
    output_dir = preprocessing_config["output_dir"]

    # An unspecified number of events defaults to what the processing step
    # needs: training events plus validation events.
    if preprocessing_config["n_events"] is None:
        processing_config = config["processing"]
        preprocessing_config["n_events"] = (
            processing_config["n_train_events"] + processing_config["n_val_events"]
        )

    if reproduce:
        delete_directory(output_dir)

    # Define parquet files to use: fall back to the detector's file names for
    # every file name the configuration does not provide.
    detector = config["common"].get("detector", cdirs.detectors[0])
    filenames = cdirs.get_filenames_from_detector(detector=detector)
    for filename_key in ("hits_particles_filename", "particles_filename"):
        if filename_key not in preprocessing_config:
            preprocessing_config[filename_key] = filenames[filename_key]

    # NOTE(review): the `done` entry is presumably created by `preprocess` once
    # it completes -- confirm against its implementation.
    if op.exists(op.join(output_dir, "done")):
        logging.info(
            f"Output directory {output_dir} exists and is not empty. "
            "Thus, the preprocessing was not run. "
            "Please use `reproduce=True` if you need to run the preprocessing again."
        )
    else:
        preprocess(raise_enough_events=raise_enough_events, **preprocessing_config)
def run_preprocessing_test_dataset(
    test_dataset_name: str,
    path_or_config_test: str | dict | None = None,
    detector: str | None = None,
    reproduce: bool = False,
    raise_enough_events: bool = False,
):
    """Run the pre-processing of a test dataset.

    The pre-processing configuration of the test dataset is built with
    ``get_test_config_for_preprocessing`` and handed over to
    ``run_preprocessing``.

    Args:
        test_dataset_name: name of the test dataset to pre-process
        path_or_config_test: YAML test dataset configuration dictionary or path
            to it. If ``None``, ``cdirs.test_config_path`` is used.
        detector: detector the test dataset belongs to. If ``None``, the first
            entry of ``cdirs.detectors`` is used.
        reproduce: whether to reproduce an existing preprocessing
        raise_enough_events: whether to raise an error if no events were
            generated.
    """
    # Resolve the optional arguments to their project-wide defaults
    detector = cdirs.detectors[0] if detector is None else detector
    path_or_config_test = (
        cdirs.test_config_path if path_or_config_test is None else path_or_config_test
    )

    run_preprocessing(
        path_or_config=get_test_config_for_preprocessing(
            test_dataset_name,
            path_or_config_test=path_or_config_test,
            detector=detector,
        ),
        reproduce=reproduce,
        raise_enough_events=raise_enough_events,
    )