From c518da0ea295be5f102ccff0d6217c0f7fd67cef Mon Sep 17 00:00:00 2001 From: Gernot Maier Date: Sat, 20 Dec 2025 16:41:22 +0100 Subject: [PATCH] Remove XGB stuff moved to Eventdisplay-ML --- environment.yml | 29 -- py/applyXGBoostforDirection.py | 492 ---------------------- py/tests/test_applyXGBoostforDirection.py | 100 ----- py/trainXGBoostforDirection.py | 437 ------------------- py/training_variables.py | 28 -- 5 files changed, 1086 deletions(-) delete mode 100644 environment.yml delete mode 100644 py/applyXGBoostforDirection.py delete mode 100644 py/tests/test_applyXGBoostforDirection.py delete mode 100644 py/trainXGBoostforDirection.py delete mode 100644 py/training_variables.py diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 9705ffd5..00000000 --- a/environment.yml +++ /dev/null @@ -1,29 +0,0 @@ ---- -name: eventdisplay_v4 -channels: - - conda-forge -dependencies: - - python=3.13 - - awkward - - awkward-pandas - - joblib - - matplotlib - - numpy - - pandas - - pre-commit - - pytest - - pytest-cov - - pytest-mock - - scikit-learn - - scipy - - shellcheck - - tabulate - - towncrier - - uproot - - xgboost - -# cheatsheet -# create: conda env create -f environment.yml -# activate: conda activate eventdisplay_v4 -# update (conda/mamba): conda env update -f environment.yml --prune -# update (micromamba): micromamba update -f environment.yml diff --git a/py/applyXGBoostforDirection.py b/py/applyXGBoostforDirection.py deleted file mode 100644 index b65c8c2c..00000000 --- a/py/applyXGBoostforDirection.py +++ /dev/null @@ -1,492 +0,0 @@ -""" -Evaluate XGBoost BDTs for Direction reconstruction - -This script applies trained XGBoost models to predict Xoff and Yoff -for each event in an input ROOT file. The output ROOT file contains -one row per input event, maintaining the original event order. -""" - -import argparse -import logging -import os -import time - -import joblib -import numpy as np -import pandas as pd -import uproot -from training_variables import xgb_training_variables - -logging.basicConfig(level=logging.INFO) -_logger = logging.getLogger("applyXGBoostforDirection") - - -def get_branch_list(): - """Branches required for inference.""" - return [ - "DispNImages", - "DispTelList_T", - "Xoff", - "Yoff", - "Xoff_intersect", - "Yoff_intersect", - "fpointing_dx", - "fpointing_dy", - ] + [var for var in xgb_training_variables()] - - -def parse_image_selection(image_selection_str): - """ - Parse the image_selection parameter. - - Can be either: - - Bit-coded value (e.g., 14 = 0b1110 = telescopes 1,2,3) - - Comma-separated indices (e.g., "1,2,3") - - Returns a list of telescope indices. - """ - if not image_selection_str: - return None - - # Parse as comma-separated indices - if "," in image_selection_str: - try: - indices = [int(x.strip()) for x in image_selection_str.split(",")] - _logger.info(f"Image selection indices: {indices}") - return indices - except ValueError: - pass - - # Parse as bit-coded value - try: - bit_value = int(image_selection_str) - # Extract bit positions (0-indexed) - indices = [i for i in range(4) if (bit_value >> i) & 1] - _logger.info(f"Image selection from bit-coded value {bit_value}: {indices}") - return indices - except ValueError: - raise ValueError( - f"Invalid image_selection format: {image_selection_str}. " - "Use bit-coded value (e.g., 14) or comma-separated indices (e.g., '1,2,3')" - ) - - -def apply_image_selection(df, selected_indices): - """Filter and pad telescope lists for selected indices.""" - if selected_indices is None or len(selected_indices) == 4: - return df - - selected_set = set(selected_indices) - - def calculate_intersection(tel_list): - return [tel_idx for tel_idx in tel_list if tel_idx in selected_set] - - df = df.copy() - df["DispTelList_T_new"] = df["DispTelList_T"].apply(calculate_intersection) - df["DispNImages_new"] = df["DispTelList_T_new"].apply(len) - - _logger.info( - f"\n{df[['DispNImages', 'DispTelList_T', 'DispNImages_new', 'DispTelList_T_new']].head(20).to_string()}" - ) - - df["DispTelList_T"] = df["DispTelList_T_new"] - df["DispNImages"] = df["DispNImages_new"] - df = df.drop(columns=["DispTelList_T_new", "DispNImages_new"]) - - for var_name in xgb_training_variables(): - if var_name in df.columns: - df[var_name] = df[var_name].apply( - lambda x: np.pad( - np.array(x), - (0, 4 - len(x)), - mode="constant", - constant_values=np.nan, - ) - if isinstance(x, (list, np.ndarray)) - else x - ) - - return df - - -def filter_by_telescope_selection(df, selected_indices): - """Return a boolean mask: True when all selected telescopes are present or event has 4 images.""" - if not selected_indices: - return pd.Series([True] * len(df), index=df.index) - - selected_set = set(selected_indices) - - def ok(tel_list): - if len(tel_list) >= 4: - return True - return selected_set.issubset(set(tel_list)) - - return df["DispTelList_T"].apply(ok) - - -def flatten_data_vectorized(df, n_tel, training_variables): - """ - Vectorized flattening of telescope array columns. - Significantly faster than row-by-row iteration. - """ - flat_features = {} - tel_list_col = "DispTelList_T" - - def to_padded_array(arrays): - """Convert list of variable-length arrays to fixed-size numpy array, padding with NaN.""" - max_len = max(len(arr) if hasattr(arr, "__len__") else 1 for arr in arrays) - result = np.full((len(arrays), max_len), np.nan) - for i, arr in enumerate(arrays): - if hasattr(arr, "__len__"): - result[i, : len(arr)] = arr - else: - result[i, 0] = arr - return result - - def to_dense_array(col): - """ - Convert a column of variable-length telescope data to a dense 2D numpy array. - - Handles uproot's awkward-style variable-length arrays from ROOT files - by converting to plain Python lists first to avoid per-element iteration overhead. - """ - # Convert to a plain Python list first to avoid per-element iteration overhead - # when dealing with uproot's awkward-style variable-length arrays - arrays = col.tolist() if hasattr(col, "tolist") else list(col) - try: - return np.vstack(arrays) - except (ValueError, TypeError): - return to_padded_array(arrays) - - tel_list_matrix = to_dense_array(df[tel_list_col]) - - for var_name in training_variables: - # Convert the column of arrays to a 2D numpy matrix - # Shape: (n_events, max_n_tel) - data_matrix = to_dense_array(df[var_name]) - - for i in range(n_tel): - col_name = f"{var_name}_{i}" - - if var_name.startswith("Disp"): - # Case 1: Simple index i - if i < data_matrix.shape[1]: - flat_features[col_name] = data_matrix[:, i] - else: - flat_features[col_name] = np.full(len(df), np.nan) - else: - # Case 2: Index lookup via DispTelList_T - target_tel_indices = tel_list_matrix[:, i].astype(int) - - row_indices = np.arange(len(df)) - valid_mask = (target_tel_indices >= 0) & ( - target_tel_indices < data_matrix.shape[1] - ) - result = np.full(len(df), np.nan) - result[valid_mask] = data_matrix[ - row_indices[valid_mask], target_tel_indices[valid_mask] - ] - - flat_features[col_name] = result - - # Convert dictionary to DataFrame once at the end - df_flat = pd.DataFrame(flat_features, index=df.index) - - # Ensure all columns are float type (uproot may load as awkward arrays) - for col in df_flat.columns: - df_flat[col] = df_flat[col].astype(np.float32) - - for i in range(n_tel): - df_flat[f"disp_x_{i}"] = df_flat[f"Disp_T_{i}"] * df_flat[f"cosphi_{i}"] - df_flat[f"disp_y_{i}"] = df_flat[f"Disp_T_{i}"] * df_flat[f"sinphi_{i}"] - df_flat[f"loss_loss_{i}"] = df_flat[f"loss_{i}"] ** 2 - df_flat[f"loss_dist{i}"] = df_flat[f"loss_{i}"] * df_flat[f"dist_{i}"] - df_flat[f"width_length_{i}"] = df_flat[f"width_{i}"] / ( - df_flat[f"length_{i}"] + 1e-6 - ) - df_flat[f"size_{i}"] = np.log10(df_flat[f"size_{i}"] + 1e-6) # avoid log(0) - # pointing corrections - df_flat[f"cen_x_{i}"] = df_flat[f"cen_x_{i}"] + df_flat[f"fpointing_dx_{i}"] - df_flat[f"cen_y_{i}"] = df_flat[f"cen_y_{i}"] + df_flat[f"fpointing_dy_{i}"] - - df_flat["Xoff_weighted_bdt"] = df["Xoff"].astype(np.float32) - df_flat["Yoff_weighted_bdt"] = df["Yoff"].astype(np.float32) - df_flat["Xoff_intersect"] = df["Xoff_intersect"].astype(np.float32) - df_flat["Yoff_intersect"] = df["Yoff_intersect"].astype(np.float32) - df_flat["Diff_Xoff"] = (df["Xoff"] - df["Xoff_intersect"]).astype(np.float32) - df_flat["Diff_Yoff"] = (df["Yoff"] - df["Yoff_intersect"]).astype(np.float32) - - return df_flat - - -def load_models(model_dir): - """ - Load XGBoost models for different telescope multiplicities from a directory. - - Parameters - ---------- - model_dir : str - Path to the directory containing the trained model files - named ``dispdir_bdt_ntel{n_tel}_xgboost.joblib``. - - Returns - ------- - dict[int, Any] - A dictionary mapping the number of telescopes (n_tel) to the - corresponding loaded model objects. Only models whose files - exist in ``model_dir`` are included. - """ - models = {} - for n_tel in range(2, 5): - model_filename = os.path.join( - model_dir, f"dispdir_bdt_ntel{n_tel}_xgboost.joblib" - ) - if os.path.exists(model_filename): - _logger.info(f"Loading model: {model_filename}") - models[n_tel] = joblib.load(model_filename) - else: - _logger.warning(f"Model not found: {model_filename}") - return models - - -def apply_models(df, models_or_dir, selection_mask=None): - """ - Apply trained XGBoost models to a DataFrame chunk. - - Parameters - ---------- - df : pandas.DataFrame - Chunk of events to process. Must contain at least the columns used in - ``xgb_training_variables()`` plus the pointing information - ("fpointing_dx", "fpointing_dy") and "DispNImages". - models_or_dir : dict[int, Any] or str - Either a preloaded models dictionary (as returned by :func:`load_models`) - or a path to a model directory. If a string is provided, models are - loaded on the fly to satisfy test expectations. - selection_mask : pandas.Series or None - Optional mask; False entries are marked with -999 in outputs. - - Returns - ------- - pred_xoff : numpy.ndarray - Array of predicted Xoff values for each event in the chunk, aligned - with the index of ``df``. - pred_yoff : numpy.ndarray - Array of predicted Yoff values for each event in the chunk, aligned - with the index of ``df``. - """ - n_events = len(df) - pred_xoff = np.full(n_events, np.nan, dtype=np.float32) - pred_yoff = np.full(n_events, np.nan, dtype=np.float32) - - if isinstance(models_or_dir, str): - models = load_models(models_or_dir) - else: - models = models_or_dir - - grouped = df.groupby("DispNImages") - - for n_tel, group_df in grouped: - n_tel = int(n_tel) - if int(n_tel) < 2: - continue - if n_tel not in models: - _logger.warning( - f"No model available for n_tel={n_tel}, skipping {len(group_df)} events" - ) - continue - - _logger.info(f"Processing {len(group_df)} events with n_tel={n_tel}") - - training_vars_with_pointing = xgb_training_variables() + [ - "fpointing_dx", - "fpointing_dy", - ] - df_flat = flatten_data_vectorized(group_df, n_tel, training_vars_with_pointing) - - excluded_columns = ["MCxoff", "MCyoff", "MCe0"] - for n in range(n_tel): - excluded_columns.append(f"fpointing_dx_{n}") - excluded_columns.append(f"fpointing_dy_{n}") - - feature_cols = [col for col in df_flat.columns if col not in excluded_columns] - X = df_flat[feature_cols] - - model = models[n_tel] - predictions = model.predict(X) - - for i, idx in enumerate(group_df.index): - pred_xoff[idx] = predictions[i, 0] - pred_yoff[idx] = predictions[i, 1] - - if selection_mask is not None: - pred_xoff = np.where(selection_mask, pred_xoff, -999.0) - pred_yoff = np.where(selection_mask, pred_yoff, -999.0) - - return pred_xoff, pred_yoff - - -def process_file_chunked( - input_file, - model_dir, - output_file, - image_selection, - max_events=None, - chunk_size=500000, -): - """ - Stream events from an input ROOT file in chunks, apply XGBoost models, - and write direction predictions to an output ROOT file. - - Parameters - ---------- - input_file : str - Path to the input ROOT file containing a "data" TTree. - model_dir : str - Directory containing the trained XGBoost model files named - ``dispdir_bdt_ntel{n_tel}_xgboost.joblib`` for different telescope - multiplicities. - output_file : str - Path to the output ROOT file to create. The file will contain a - "StereoAnalysis" TTree with ``Dir_Xoff`` and ``Dir_Yoff`` branches. - image_selection : str - String specifying which telescope indices to select, passed to - :func:`parse_image_selection` to obtain the corresponding indices - used by :func:`apply_image_selection`. - max_events : int, optional - Maximum number of events to process. If None (default), all - available events in the input file are processed. - chunk_size : int, optional - Number of events to read and process per chunk. Larger values reduce - I/O overhead but increase memory usage. Default is 500000. - - Returns - ------- - None - This function writes results directly to ``output_file`` and does not - return a value. - """ - models = load_models(model_dir) - branch_list = get_branch_list() - selected_indices = parse_image_selection(image_selection) - - _logger.info(f"Chunk size: {chunk_size}") - if max_events: - _logger.info(f"Maximum events to process: {max_events}") - - with uproot.recreate(output_file) as root_file: - tree = root_file.mktree( - "StereoAnalysis", - {"Dir_Xoff": np.float32, "Dir_Yoff": np.float32}, - ) - - total_processed = 0 - - for df_chunk in uproot.iterate( - f"{input_file}:data", - branch_list, - library="pd", - step_size=chunk_size, - ): - if df_chunk.empty: - continue - - df_chunk = apply_image_selection(df_chunk, selected_indices) - if df_chunk.empty: - continue - - # Stop early if we've reached max_events limit - if max_events is not None and total_processed >= max_events: - break - - # Reset index to local chunk indices (0, 1, 2, ...) to avoid - # index out-of-bounds when indexing chunk-sized output arrays - df_chunk = df_chunk.reset_index(drop=True) - - pred_xoff, pred_yoff = apply_models(df_chunk, models) - - tree.extend( - { - "Dir_Xoff": np.asarray(pred_xoff, dtype=np.float32), - "Dir_Yoff": np.asarray(pred_yoff, dtype=np.float32), - } - ) - - total_processed += len(df_chunk) - _logger.info(f"Processed {total_processed} events so far") - - _logger.info( - f"Streaming complete. Total processed events written: {total_processed}" - ) - - -def main(): - parser = argparse.ArgumentParser( - description=("Apply XGBoost Multi-Target BDTs for Stereo Reconstruction") - ) - parser.add_argument( - "--input-file", - required=True, - metavar="INPUT.root", - help="Path to input mscw ROOT file", - ) - parser.add_argument( - "--model-dir", - required=True, - metavar="MODEL_DIR", - help="Directory containing XGBoost models", - ) - parser.add_argument( - "--output-file", - required=True, - metavar="OUTPUT.root", - help="Output ROOT file path for predictions", - ) - parser.add_argument( - "--image-selection", - type=str, - default="15", - help=( - "Optional telescope selection. Can be bit-coded (e.g., 14 for telescopes 1,2,3) " - "or comma-separated indices (e.g., '1,2,3'). " - "Keeps events with all selected telescopes or 4-telescope events. " - "Default is 15 (0b1111), which selects all 4 telescopes." - ), - ) - parser.add_argument( - "--max-events", - type=int, - default=None, - help="Maximum number of events to process (default: all events)", - ) - parser.add_argument( - "--chunk-size", - type=int, - default=500000, - help="Number of events to process per chunk (default: 500000)", - ) - args = parser.parse_args() - - start_time = time.time() - _logger.info("--- XGBoost Multi-Target Direction Evaluation ---") - _logger.info(f"Input file: {args.input_file}") - _logger.info(f"Model directory: {args.model_dir}") - _logger.info(f"Output file: {args.output_file}") - if args.image_selection: - _logger.info(f"Image selection: {args.image_selection}") - - process_file_chunked( - input_file=args.input_file, - model_dir=args.model_dir, - output_file=args.output_file, - image_selection=args.image_selection, - max_events=args.max_events, - chunk_size=args.chunk_size, - ) - - elapsed_time = time.time() - start_time - _logger.info(f"Processing complete. Total time: {elapsed_time:.2f} seconds") - - -if __name__ == "__main__": - main() diff --git a/py/tests/test_applyXGBoostforDirection.py b/py/tests/test_applyXGBoostforDirection.py deleted file mode 100644 index 96de97e9..00000000 --- a/py/tests/test_applyXGBoostforDirection.py +++ /dev/null @@ -1,100 +0,0 @@ -import applyXGBoostforDirection as mod -import numpy as np -import pandas as pd - - -def test_parse_image_selection_indices(): - indices = mod.parse_image_selection("1,2,3") - assert indices == [1, 2, 3] - - -def test_parse_image_selection_bits(): - indices = mod.parse_image_selection("14") # 0b1110 -> [1,2,3] - assert indices == [1, 2, 3] - - -def test_filter_by_telescope_selection_returns_mask_and_preserves_length(): - df = pd.DataFrame( - { - "DispTelList_T": [ - [1, 2, 3], # has 1,2,3 - [1, 3], # missing 2 - [0, 1, 2, 3], # 4 telescopes -> always included - [0, 2], # missing 1,3 - ] - } - ) - selected = [1, 2, 3] - mask = mod.filter_by_telescope_selection(df, selected) - assert isinstance(mask, pd.Series) - assert len(mask) == len(df) - # Expect True for rows 0 and 2 - assert mask.tolist() == [True, False, True, False] - - -class DummyModel: - def __init__(self, out_val=(0.0, 0.0)): - self.out_val = np.array(out_val, dtype=float) - - def predict(self, X): - # Return shape (n_rows, 2) filled with out_val - n = len(X) - return np.tile(self.out_val, (n, 1)) - - -def test_apply_models_with_selection_mask(monkeypatch): - # Build a minimal DataFrame with required columns - df = pd.DataFrame( - { - "DispNImages": [3, 3, 4, 2, 3], - "DispTelList_T": [ - [1, 2, 3], # selected - [1, 3], # unselected - [0, 1, 2, 3], # 4 telescopes -> selected - [0, 1], # unselected - [1, 2, 3], # selected - ], - # Truth-related columns used downstream; values not relevant for this test - "Xoff": [0, 0, 0, 0, 0], - "Yoff": [0, 0, 0, 0, 0], - "Xoff_intersect": [0, 0, 0, 0, 0], - "Yoff_intersect": [0, 0, 0, 0, 0], - } - ) - - # Selection: require telescopes 1,2,3 or len==4 - selection_mask = mod.filter_by_telescope_selection(df, [1, 2, 3]) - - # Monkeypatch model loading and existence checks - monkeypatch.setattr(mod.os.path, "exists", lambda p: True) - - # Always return a DummyModel; value differentiates by n_tel if desired - def fake_load(path): - if "ntel4" in path: - return DummyModel(out_val=(4.0, -4.0)) - elif "ntel3" in path: - return DummyModel(out_val=(3.0, -3.0)) - else: - return DummyModel(out_val=(2.0, -2.0)) - - monkeypatch.setattr(mod.joblib, "load", fake_load) - - # Monkeypatch flattening to avoid complex array inputs - def fake_flatten(group_df, n_tel, training_vars): - # Provide a simple feature column not in excluded list - return pd.DataFrame({"feature": np.zeros(len(group_df))}, index=group_df.index) - - monkeypatch.setattr(mod, "flatten_data_vectorized", fake_flatten) - - pred_x, pred_y = mod.apply_models(df, "dummy_models_dir", selection_mask) - - # Output length must match input length - assert len(pred_x) == len(df) - assert len(pred_y) == len(df) - - # Unselected rows must be -999; selected rows come from model values - # From selection_mask above: rows 0,2,4 are selected; 1,3 are unselected - expected_x = [3.0, -999.0, 4.0, -999.0, 3.0] - expected_y = [-3.0, -999.0, -4.0, -999.0, -3.0] - assert np.allclose(pred_x, expected_x, equal_nan=False) - assert np.allclose(pred_y, expected_y, equal_nan=False) diff --git a/py/trainXGBoostforDirection.py b/py/trainXGBoostforDirection.py deleted file mode 100644 index a2d7c317..00000000 --- a/py/trainXGBoostforDirection.py +++ /dev/null @@ -1,437 +0,0 @@ -""" -Train XGBoost Multi-Target BDTs for Direction Reconstruction -================================================= - -Uses x,y offsets calculated from intersection and dispBDT methods plus -image parameters to train multi-target regression BDTs to predict x,y offsets. - -Separate BDTs are trained for 2, 3, and 4 telescope multiplicity events. - -""" - -import argparse -import logging -import os -from typing import Dict, List - -import numpy as np -import pandas as pd -import uproot -import xgboost as xgb -from joblib import dump -from sklearn.metrics import mean_absolute_error, mean_squared_error -from sklearn.model_selection import train_test_split -from sklearn.multioutput import MultiOutputRegressor -from training_variables import xgb_training_variables - -logging.basicConfig(level=logging.INFO) -_logger = logging.getLogger("trainXGBoostforDirection") - - -def load_and_flatten_data(input_files, n_tel, max_events, training_step=True): - """ - Reads the data from ROOT files, filters for the required multiplicity (n_tel), - and flattens the telescope-array features into a Pandas DataFrame. - """ - _logger.info(f"\n--- Loading and Flattening Data for n_tel = {n_tel} ---") - _logger.info( - f"Max events to process: {max_events if max_events > 0 else 'All available'}" - ) - - branch_list = [ - "DispNImages", - "DispTelList_T", - "Xoff", - "Yoff", - "Xoff_intersect", - "Yoff_intersect", - "MCxoff", - "MCyoff", - "MCe0", - ] + [var for var in xgb_training_variables()] - - dfs = [] - if max_events > 0: - max_events_per_file = max_events // len(input_files) - else: - max_events_per_file = None - - for f in input_files: - try: - with uproot.open(f) as root_file: - if "data" in root_file: - _logger.info(f"Processing file: {f}") - tree = root_file["data"] - df = tree.arrays(branch_list, library="pd") - df = df[df["DispNImages"] == n_tel] - _logger.info(f"Number of events after n_tel filter: {len(df)}") - if max_events_per_file and len(df) > max_events_per_file: - df = df.sample(n=max_events_per_file, random_state=42) - if not df.empty: - dfs.append(df) - else: - _logger.warning(f"File: {f} does not contain a 'data' tree.") - except Exception as e: - _logger.error(f"Error opening or reading file {f}: {e}") - - if len(dfs) == 0: - _logger.error("No data loaded from input files.") - return pd.DataFrame() - - data_tree = pd.concat(dfs, ignore_index=True) - _logger.info(f"Total events for n_tel={n_tel}: {len(data_tree)}") - - if data_tree.empty: - return pd.DataFrame() - - # Compute weights: - # - R (to reflect physical sky area) - # - E (to balance energy distribution) - sample_weights = ( - np.sqrt( - data_tree["MCxoff"] ** 2 + data_tree["MCyoff"] ** 2 - ) # * data_tree["MCe0"] - ) - - df_flat = flatten_data_vectorized(data_tree, n_tel, xgb_training_variables()) - - if training_step: - df_flat["MCxoff"] = data_tree["MCxoff"] - df_flat["MCyoff"] = data_tree["MCyoff"] - df_flat["MCe0"] = np.log10(data_tree["MCe0"]) - df_flat["sample_weight"] = sample_weights - - df_flat.dropna(inplace=True) - _logger.info(f"Final events for n_tel={n_tel} after cleanup: {len(df_flat)}") - - return df_flat - - -def flatten_data_vectorized(df, n_tel, training_variables): - """Vectorized flattening of telescope array columns.""" - flat_features = {} - - try: - tel_list_matrix = np.vstack(df["DispTelList_T"].values) - except ValueError: - tel_list_matrix = np.array(df["DispTelList_T"].tolist()) - - for var_name in training_variables: - # Convert the column of arrays to a 2D numpy matrix - # Shape: (n_events, max_n_tel) - try: - data_matrix = np.vstack(df[var_name].values) - except ValueError: - data_matrix = np.array(df[var_name].tolist()) - - for i in range(n_tel): - col_name = f"{var_name}_{i}" - - if var_name.startswith("Disp"): - # Case 1: Simple index i - if i < data_matrix.shape[1]: - flat_features[col_name] = data_matrix[:, i] - else: - flat_features[col_name] = np.full(len(df), np.nan) - else: - # Case 2: Index lookup via DispTelList_T - target_tel_indices = tel_list_matrix[:, i].astype(int) - - row_indices = np.arange(len(df)) - valid_mask = (target_tel_indices >= 0) & ( - target_tel_indices < data_matrix.shape[1] - ) - result = np.full(len(df), np.nan) - result[valid_mask] = data_matrix[ - row_indices[valid_mask], target_tel_indices[valid_mask] - ] - - flat_features[col_name] = result - - # Convert dictionary to DataFrame once at the end - df_flat = pd.DataFrame(flat_features, index=df.index) - - # Additional derived features to support finding certain event classes - for i in range(n_tel): - df_flat[f"disp_x_{i}"] = df_flat[f"Disp_T_{i}"] * df_flat[f"cosphi_{i}"] - df_flat[f"disp_y_{i}"] = df_flat[f"Disp_T_{i}"] * df_flat[f"sinphi_{i}"] - df_flat[f"loss_loss_{i}"] = df_flat[f"loss_{i}"] ** 2 - df_flat[f"loss_dist{i}"] = df_flat[f"loss_{i}"] * df_flat[f"dist_{i}"] - df_flat[f"width_length_{i}"] = df_flat[f"width_{i}"] / ( - df_flat[f"length_{i}"] + 1e-6 - ) - df_flat[f"size_{i}"] = np.log10(df_flat[f"size_{i}"] + 1e-6) # avoid log(0) - - df_flat["Xoff_weighted_bdt"] = df["Xoff"] - df_flat["Yoff_weighted_bdt"] = df["Yoff"] - df_flat["Xoff_intersect"] = df["Xoff_intersect"] - df_flat["Yoff_intersect"] = df["Yoff_intersect"] - df_flat["Diff_Xoff"] = df["Xoff"] - df["Xoff_intersect"] - df_flat["Diff_Yoff"] = df["Yoff"] - df["Yoff_intersect"] - - return df_flat - - -def train(df, n_tel, output_dir, train_test_fraction): - """ - Trains a single XGBoost model for multi-target regression (Xoff, Yoff). - - Parameters: - - df: Pandas DataFrame with training data. - - n_tel: Telescope multiplicity. - - output_dir: Directory to save the trained model. - - train_test_fraction: Fraction of data to use for training. - """ - if df.empty: - _logger.warning(f"Skipping training for n_tel={n_tel} due to empty data.") - return - - # Features (X) and targets (Y) - X_cols = [ - col - for col in df.columns - if col not in ["MCxoff", "MCyoff", "MCe0", "sample_weight"] - ] - X = df[X_cols] - Y = df[["MCxoff", "MCyoff"]] - - _logger.info(f"Training variables ({len(X_cols)}): {X_cols}") - - X_train, X_test, Y_train, Y_test, W_train, _ = train_test_split( - X, - Y, - df["sample_weight"], - test_size=1.0 - train_test_fraction, - random_state=None, - ) - - _logger.info( - f"n_tel={n_tel}: Training events: {len(X_train)}, Testing events: {len(X_test)}" - ) - - # Parse TMVA options (simplified mapping to XGBoost parameters) - # The default TMVA string is: - # !V:NTrees=800:BoostType=Grad:Shrinkage=0.1:MaxDepth=4:MinNodeSize=1.0% - - # Note: XGBoost defaults to gbtree (Gradient Boosting). - # MultiOutputRegressor requires a base estimator (e.g., plain XGBRegressor) - xgb_params = { - "n_estimators": 1000, - "learning_rate": 0.1, # Shrinkage - "max_depth": 5, - "min_child_weight": 1.0, # Equivalent to MinNodeSize=1.0% for XGBoost - "objective": "reg:squarederror", - # https://xgboosting.com/configure-xgboost-regpseudohubererror-objective/ - # "objective": "reg:pseudohubererror", - "n_jobs": 4, - "random_state": None, - "tree_method": "hist", - "subsample": 0.7, # Default sensible value - "colsample_bytree": 0.7, # Default sensible value - } - rf_params = { - "n_estimators": 1000, - "max_depth": None, - "max_features": "sqrt", - "min_samples_leaf": 5, - "n_jobs": 4, - "random_state": None, - } - # Configure models - # - xgboost default approach - configs = { - "xgboost": xgb.XGBRegressor(**xgb_params), - # "random_forest": RandomForestRegressor(**rf_params), - } - _logger.info( - f"Sample weights (not(!) used in training) - min: {W_train.min():.6f}, " - f"max: {W_train.max():.6f}, mean: {W_train.mean():.6f}" - ) - - for name, estimator in configs.items(): - _logger.info(f"Training with {name} for n_tel={n_tel}...") - _logger.info(f"parameters: {xgb_params if name == 'xgboost' else rf_params}") - model = MultiOutputRegressor(estimator) - model.fit(X_train, Y_train) - - output_filename = os.path.join( - output_dir, f"dispdir_bdt_ntel{n_tel}_{name}.joblib" - ) - dump(model, output_filename) - _logger.info(f"{name} model saved to: {output_filename}") - - evaluate_model(model, X_test, Y_test, df, X_cols, Y, name) - - -def evaluate_model(model, X_test, Y_test, df, X_cols, Y, name): - """ - Evaluates the trained model on the test set and prints performance metrics. - - """ - score = model.score(X_test, Y_test) - _logger.info(f"XGBoost Multi-Target R^2 Score (Testing Set): {score:.4f}") - Y_pred = model.predict(X_test) - mse_x = mean_squared_error(Y_test["MCxoff"], Y_pred[:, 0]) - mse_y = mean_squared_error(Y_test["MCyoff"], Y_pred[:, 1]) - _logger.info(f"{name} MSE (X_off): {mse_x:.4f}, MSE (Y_off): {mse_y:.4f}") - mae_x = mean_absolute_error(Y_test["MCxoff"], Y_pred[:, 0]) - mae_y = mean_absolute_error(Y_test["MCyoff"], Y_pred[:, 1]) - _logger.info(f"{name} MAE (X_off): {mae_x:.4f}, MAE (Y_off): {mae_y:.4f}") - - feature_importance(model, X_cols, Y.columns, name) - if name == "xgboost": - shap_feature_importance(model, X_test, Y.columns) - - angular_resolution( - Y_pred, - Y_test, - df, - percentiles=[68, 90, 95], - log_e_min=-1, - log_e_max=2, - n_bins=6, - name=name, - ) - - -def angular_resolution( - Y_pred, Y_test, df, percentiles, log_e_min, log_e_max, n_bins, name -): - """ - Computes and logs the angular resolution based on predicted and true offsets. - """ - results_df = pd.DataFrame( - { - "MCxoff_true": Y_test["MCxoff"].values, - "MCyoff_true": Y_test["MCyoff"].values, - "MCxoff_pred": Y_pred[:, 0], - "MCyoff_pred": Y_pred[:, 1], - "MCe0": df.loc[Y_test.index, "MCe0"].values, - } - ) - - results_df["DeltaTheta"] = np.sqrt( - (results_df["MCxoff_true"] - results_df["MCxoff_pred"]) ** 2 - + (results_df["MCyoff_true"] - results_df["MCyoff_pred"]) ** 2 - ) - - results_df["LogE"] = results_df["MCe0"] - bins = np.linspace(log_e_min, log_e_max, n_bins + 1) - results_df["E_bin"] = pd.cut(results_df["LogE"], bins=bins, include_lowest=True) - results_df.dropna(subset=["E_bin"], inplace=True) - - resolution_results: Dict[str, List[float]] = {} - mean_logE_by_bin = ( - results_df.groupby("E_bin", observed=False)["LogE"].mean().round(3) - ) - - # Group and calculate the specified percentile for DeltaTheta - for p in percentiles: - p_res = results_df.groupby("E_bin", observed=False)["DeltaTheta"].agg( - lambda x: np.percentile(x, p) if len(x) > 0 else np.nan - ) - resolution_results[f"Theta_{p}%"] = p_res.values - - output_df = pd.DataFrame(resolution_results, index=mean_logE_by_bin.index) - output_df.insert(0, "Mean Log10(E)", mean_logE_by_bin.values) - output_df.index.name = "Log10(E) Bin Range" - output_df = output_df.dropna() # Drop bins with no events - - _logger.info(f"\n--- {name} Angular Resolution vs. Log10(MCe0) ---") - _logger.info( - f"Calculated over {n_bins} bins between Log10(E) = {log_e_min} and {log_e_max}" - ) - _logger.info("\n" + output_df.to_markdown(floatfmt=".4f")) - - -def feature_importance(model, X_cols, target_names, name=None): - """ - Prints feature importance from the trained XGBoost model. - """ - _logger.info("--- XGBoost Multi-Regression Feature Importance ---") - for i, estimator in enumerate(model.estimators_): - target = target_names[i] - _logger.info(f"\n### {name} Importance for Target: **{target}**") - - importances = estimator.feature_importances_ - importance_df = pd.DataFrame({"Feature": X_cols, "Importance": importances}) - - importance_df = importance_df.sort_values(by="Importance", ascending=False) - _logger.info("\n" + importance_df.head(15).to_markdown(index=False)) - - -def shap_feature_importance(model, X, target_names, max_points=20000, n_top=25): - """ - Uses XGBoost's builtin SHAP (pred_contribs=True). - Avoids SHAP.TreeExplainer compatibility issues with XGBoost ≥1.7. - """ - - # Subsample - X_sample = X.sample(n=min(len(X), max_points), random_state=0) - - # Loop through estimators (one per target) - for i, est in enumerate(model.estimators_): - target = target_names[i] - - # Builtin XGBoost SHAP values (n_samples, n_features+1) - # Last column is the bias term → drop it - shap_vals = est.get_booster().predict(xgb.DMatrix(X_sample), pred_contribs=True) - shap_vals = shap_vals[:, :-1] # drop bias column - - # Global importance: mean(|SHAP|) - imp = np.abs(shap_vals).mean(axis=0) - idx = np.argsort(imp)[::-1] - - _logger.info(f"\n=== Builtin XGBoost SHAP Importance for {target} ===") - for j in idx[:n_top]: - _logger.info(f"{X.columns[j]:25s} {imp[j]:.6e}") - - -def main(): - parser = argparse.ArgumentParser( - description=("Train XGBoost Multi-Target BDTs for Direction Reconstruction") - ) - parser.add_argument("--input_file_list", help="List of input mscw ROOT files.") - parser.add_argument("--ntel", type=int, help="Telescope multiplicity (2, 3, or 4).") - parser.add_argument( - "--output_dir", help="Output directory for XGBoost models and weights." - ) - parser.add_argument( - "--train_test_fraction", - type=float, - help="Fraction of data for training (e.g., 0.5).", - ) - parser.add_argument( - "--max_events", - type=int, - help="Maximum number of events to process across all files.", - ) - - args = parser.parse_args() - - try: - with open(args.input_file_list, "r") as f: - input_files = [line.strip() for line in f if line.strip()] - except FileNotFoundError as exc: - raise FileNotFoundError( - f"Error: Input file list not found: {args.input_file_list}" - ) from exc - - if not os.path.exists(args.output_dir): - os.makedirs(args.output_dir) - - _logger.info("--- XGBoost Multi-Target Direction Training ---") - _logger.info(f"Input files: {len(input_files)}") - _logger.info(f"Telescope multiplicity: {args.ntel}") - _logger.info(f"Output directory: {args.output_dir}") - _logger.info( - f"Train vs test fraction: {args.train_test_fraction}, Max events: {args.max_events}" - ) - - df_flat = load_and_flatten_data(input_files, args.ntel, args.max_events) - train(df_flat, args.ntel, args.output_dir, args.train_test_fraction) - _logger.info("\nXGBoost model trained successfully.") - - -if __name__ == "__main__": - main() diff --git a/py/training_variables.py b/py/training_variables.py deleted file mode 100644 index 53cb44c4..00000000 --- a/py/training_variables.py +++ /dev/null @@ -1,28 +0,0 @@ -"""Training variables for XGBoost direction reconstruction.""" - - -def xgb_training_variables(): - """ - Telescope-type training variables for XGB. - - Disp variables with different indexing logic in data preparation. - """ - - return [ - "Disp_T", - "DispXoff_T", - "DispYoff_T", - "DispWoff_T", - "cen_x", - "cen_y", - "cosphi", - "sinphi", - "loss", - "size", - "dist", - "width", - "length", - "asym", - "tgrad_x", - "R_core", - ]