From c518da0ea295be5f102ccff0d6217c0f7fd67cef Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Sat, 20 Dec 2025 16:41:22 +0100
Subject: [PATCH] Remove XGB stuff moved to Eventdisplay-ML

---
 environment.yml                           |  29 --
 py/applyXGBoostforDirection.py            | 492 ----------------------
 py/tests/test_applyXGBoostforDirection.py | 100 -----
 py/trainXGBoostforDirection.py            | 437 -------------------
 py/training_variables.py                  |  28 --
 5 files changed, 1086 deletions(-)
 delete mode 100644 environment.yml
 delete mode 100644 py/applyXGBoostforDirection.py
 delete mode 100644 py/tests/test_applyXGBoostforDirection.py
 delete mode 100644 py/trainXGBoostforDirection.py
 delete mode 100644 py/training_variables.py

diff --git a/environment.yml b/environment.yml
deleted file mode 100644
index 9705ffd5..00000000
--- a/environment.yml
+++ /dev/null
@@ -1,29 +0,0 @@
----
-name: eventdisplay_v4
-channels:
-  - conda-forge
-dependencies:
-  - python=3.13
-  - awkward
-  - awkward-pandas
-  - joblib
-  - matplotlib
-  - numpy
-  - pandas
-  - pre-commit
-  - pytest
-  - pytest-cov
-  - pytest-mock
-  - scikit-learn
-  - scipy
-  - shellcheck
-  - tabulate
-  - towncrier
-  - uproot
-  - xgboost
-
-# cheatsheet
-# create: conda env create -f environment.yml
-# activate: conda activate eventdisplay_v4
-# update (conda/mamba): conda env update -f environment.yml --prune
-# update (micromamba): micromamba update -f environment.yml
diff --git a/py/applyXGBoostforDirection.py b/py/applyXGBoostforDirection.py
deleted file mode 100644
index b65c8c2c..00000000
--- a/py/applyXGBoostforDirection.py
+++ /dev/null
@@ -1,492 +0,0 @@
-"""
-Evaluate XGBoost BDTs for Direction reconstruction
-
-This script applies trained XGBoost models to predict Xoff and Yoff
-for each event in an input ROOT file. The output ROOT file contains
-one row per input event, maintaining the original event order.
-"""
-
-import argparse
-import logging
-import os
-import time
-
-import joblib
-import numpy as np
-import pandas as pd
-import uproot
-from training_variables import xgb_training_variables
-
-logging.basicConfig(level=logging.INFO)
-_logger = logging.getLogger("applyXGBoostforDirection")
-
-
-def get_branch_list():
-    """Branches required for inference."""
-    return [
-        "DispNImages",
-        "DispTelList_T",
-        "Xoff",
-        "Yoff",
-        "Xoff_intersect",
-        "Yoff_intersect",
-        "fpointing_dx",
-        "fpointing_dy",
-    ] + [var for var in xgb_training_variables()]
-
-
-def parse_image_selection(image_selection_str):
-    """
-    Parse the image_selection parameter.
-
-    Can be either:
-    - Bit-coded value (e.g., 14 = 0b1110 = telescopes 1,2,3)
-    - Comma-separated indices (e.g., "1,2,3")
-
-    Returns a list of telescope indices.
-    """
-    if not image_selection_str:
-        return None
-
-    # Parse as comma-separated indices
-    if "," in image_selection_str:
-        try:
-            indices = [int(x.strip()) for x in image_selection_str.split(",")]
-            _logger.info(f"Image selection indices: {indices}")
-            return indices
-        except ValueError:
-            pass
-
-    # Parse as bit-coded value
-    try:
-        bit_value = int(image_selection_str)
-        # Extract bit positions (0-indexed)
-        indices = [i for i in range(4) if (bit_value >> i) & 1]
-        _logger.info(f"Image selection from bit-coded value {bit_value}: {indices}")
-        return indices
-    except ValueError:
-        raise ValueError(
-            f"Invalid image_selection format: {image_selection_str}. "
-            "Use bit-coded value (e.g., 14) or comma-separated indices (e.g., '1,2,3')"
-        )
-
-
-def apply_image_selection(df, selected_indices):
-    """Filter and pad telescope lists for selected indices."""
-    if selected_indices is None or len(selected_indices) == 4:
-        return df
-
-    selected_set = set(selected_indices)
-
-    def calculate_intersection(tel_list):
-        return [tel_idx for tel_idx in tel_list if tel_idx in selected_set]
-
-    df = df.copy()
-    df["DispTelList_T_new"] = df["DispTelList_T"].apply(calculate_intersection)
-    df["DispNImages_new"] = df["DispTelList_T_new"].apply(len)
-
-    _logger.info(
-        f"\n{df[['DispNImages', 'DispTelList_T', 'DispNImages_new', 'DispTelList_T_new']].head(20).to_string()}"
-    )
-
-    df["DispTelList_T"] = df["DispTelList_T_new"]
-    df["DispNImages"] = df["DispNImages_new"]
-    df = df.drop(columns=["DispTelList_T_new", "DispNImages_new"])
-
-    for var_name in xgb_training_variables():
-        if var_name in df.columns:
-            df[var_name] = df[var_name].apply(
-                lambda x: np.pad(
-                    np.array(x),
-                    (0, 4 - len(x)),
-                    mode="constant",
-                    constant_values=np.nan,
-                )
-                if isinstance(x, (list, np.ndarray))
-                else x
-            )
-
-    return df
-
-
-def filter_by_telescope_selection(df, selected_indices):
-    """Return a boolean mask: True when all selected telescopes are present or event has 4 images."""
-    if not selected_indices:
-        return pd.Series([True] * len(df), index=df.index)
-
-    selected_set = set(selected_indices)
-
-    def ok(tel_list):
-        if len(tel_list) >= 4:
-            return True
-        return selected_set.issubset(set(tel_list))
-
-    return df["DispTelList_T"].apply(ok)
-
-
-def flatten_data_vectorized(df, n_tel, training_variables):
-    """
-    Vectorized flattening of telescope array columns.
-    Significantly faster than row-by-row iteration.
-    """
-    flat_features = {}
-    tel_list_col = "DispTelList_T"
-
-    def to_padded_array(arrays):
-        """Convert list of variable-length arrays to fixed-size numpy array, padding with NaN."""
-        max_len = max(len(arr) if hasattr(arr, "__len__") else 1 for arr in arrays)
-        result = np.full((len(arrays), max_len), np.nan)
-        for i, arr in enumerate(arrays):
-            if hasattr(arr, "__len__"):
-                result[i, : len(arr)] = arr
-            else:
-                result[i, 0] = arr
-        return result
-
-    def to_dense_array(col):
-        """
-        Convert a column of variable-length telescope data to a dense 2D numpy array.
-
-        Handles uproot's awkward-style variable-length arrays from ROOT files
-        by converting to plain Python lists first to avoid per-element iteration overhead.
-        """
-        # Convert to a plain Python list first to avoid per-element iteration overhead
-        # when dealing with uproot's awkward-style variable-length arrays
-        arrays = col.tolist() if hasattr(col, "tolist") else list(col)
-        try:
-            return np.vstack(arrays)
-        except (ValueError, TypeError):
-            return to_padded_array(arrays)
-
-    tel_list_matrix = to_dense_array(df[tel_list_col])
-
-    for var_name in training_variables:
-        # Convert the column of arrays to a 2D numpy matrix
-        # Shape: (n_events, max_n_tel)
-        data_matrix = to_dense_array(df[var_name])
-
-        for i in range(n_tel):
-            col_name = f"{var_name}_{i}"
-
-            if var_name.startswith("Disp"):
-                # Case 1: Simple index i
-                if i < data_matrix.shape[1]:
-                    flat_features[col_name] = data_matrix[:, i]
-                else:
-                    flat_features[col_name] = np.full(len(df), np.nan)
-            else:
-                # Case 2: Index lookup via DispTelList_T
-                target_tel_indices = tel_list_matrix[:, i].astype(int)
-
-                row_indices = np.arange(len(df))
-                valid_mask = (target_tel_indices >= 0) & (
-                    target_tel_indices < data_matrix.shape[1]
-                )
-                result = np.full(len(df), np.nan)
-                result[valid_mask] = data_matrix[
-                    row_indices[valid_mask], target_tel_indices[valid_mask]
-                ]
-
-                flat_features[col_name] = result
-
-    # Convert dictionary to DataFrame once at the end
-    df_flat = pd.DataFrame(flat_features, index=df.index)
-
-    # Ensure all columns are float type (uproot may load as awkward arrays)
-    for col in df_flat.columns:
-        df_flat[col] = df_flat[col].astype(np.float32)
-
-    for i in range(n_tel):
-        df_flat[f"disp_x_{i}"] = df_flat[f"Disp_T_{i}"] * df_flat[f"cosphi_{i}"]
-        df_flat[f"disp_y_{i}"] = df_flat[f"Disp_T_{i}"] * df_flat[f"sinphi_{i}"]
-        df_flat[f"loss_loss_{i}"] = df_flat[f"loss_{i}"] ** 2
-        df_flat[f"loss_dist{i}"] = df_flat[f"loss_{i}"] * df_flat[f"dist_{i}"]
-        df_flat[f"width_length_{i}"] = df_flat[f"width_{i}"] / (
-            df_flat[f"length_{i}"] + 1e-6
-        )
-        df_flat[f"size_{i}"] = np.log10(df_flat[f"size_{i}"] + 1e-6)  # avoid log(0)
-        # pointing corrections
-        df_flat[f"cen_x_{i}"] = df_flat[f"cen_x_{i}"] + df_flat[f"fpointing_dx_{i}"]
-        df_flat[f"cen_y_{i}"] = df_flat[f"cen_y_{i}"] + df_flat[f"fpointing_dy_{i}"]
-
-    df_flat["Xoff_weighted_bdt"] = df["Xoff"].astype(np.float32)
-    df_flat["Yoff_weighted_bdt"] = df["Yoff"].astype(np.float32)
-    df_flat["Xoff_intersect"] = df["Xoff_intersect"].astype(np.float32)
-    df_flat["Yoff_intersect"] = df["Yoff_intersect"].astype(np.float32)
-    df_flat["Diff_Xoff"] = (df["Xoff"] - df["Xoff_intersect"]).astype(np.float32)
-    df_flat["Diff_Yoff"] = (df["Yoff"] - df["Yoff_intersect"]).astype(np.float32)
-
-    return df_flat
-
-
-def load_models(model_dir):
-    """
-    Load XGBoost models for different telescope multiplicities from a directory.
-
-    Parameters
-    ----------
-    model_dir : str
-        Path to the directory containing the trained model files
-        named ``dispdir_bdt_ntel{n_tel}_xgboost.joblib``.
-
-    Returns
-    -------
-    dict[int, Any]
-        A dictionary mapping the number of telescopes (n_tel) to the
-        corresponding loaded model objects. Only models whose files
-        exist in ``model_dir`` are included.
-    """
-    models = {}
-    for n_tel in range(2, 5):
-        model_filename = os.path.join(
-            model_dir, f"dispdir_bdt_ntel{n_tel}_xgboost.joblib"
-        )
-        if os.path.exists(model_filename):
-            _logger.info(f"Loading model: {model_filename}")
-            models[n_tel] = joblib.load(model_filename)
-        else:
-            _logger.warning(f"Model not found: {model_filename}")
-    return models
-
-
-def apply_models(df, models_or_dir, selection_mask=None):
-    """
-    Apply trained XGBoost models to a DataFrame chunk.
-
-    Parameters
-    ----------
-    df : pandas.DataFrame
-        Chunk of events to process. Must contain at least the columns used in
-        ``xgb_training_variables()`` plus the pointing information
-        ("fpointing_dx", "fpointing_dy") and "DispNImages".
-    models_or_dir : dict[int, Any] or str
-        Either a preloaded models dictionary (as returned by :func:`load_models`)
-        or a path to a model directory. If a string is provided, models are
-        loaded on the fly to satisfy test expectations.
-    selection_mask : pandas.Series or None
-        Optional mask; False entries are marked with -999 in outputs.
-
-    Returns
-    -------
-    pred_xoff : numpy.ndarray
-        Array of predicted Xoff values for each event in the chunk, aligned
-        with the index of ``df``.
-    pred_yoff : numpy.ndarray
-        Array of predicted Yoff values for each event in the chunk, aligned
-        with the index of ``df``.
-    """
-    n_events = len(df)
-    pred_xoff = np.full(n_events, np.nan, dtype=np.float32)
-    pred_yoff = np.full(n_events, np.nan, dtype=np.float32)
-
-    if isinstance(models_or_dir, str):
-        models = load_models(models_or_dir)
-    else:
-        models = models_or_dir
-
-    grouped = df.groupby("DispNImages")
-
-    for n_tel, group_df in grouped:
-        n_tel = int(n_tel)
-        if int(n_tel) < 2:
-            continue
-        if n_tel not in models:
-            _logger.warning(
-                f"No model available for n_tel={n_tel}, skipping {len(group_df)} events"
-            )
-            continue
-
-        _logger.info(f"Processing {len(group_df)} events with n_tel={n_tel}")
-
-        training_vars_with_pointing = xgb_training_variables() + [
-            "fpointing_dx",
-            "fpointing_dy",
-        ]
-        df_flat = flatten_data_vectorized(group_df, n_tel, training_vars_with_pointing)
-
-        excluded_columns = ["MCxoff", "MCyoff", "MCe0"]
-        for n in range(n_tel):
-            excluded_columns.append(f"fpointing_dx_{n}")
-            excluded_columns.append(f"fpointing_dy_{n}")
-
-        feature_cols = [col for col in df_flat.columns if col not in excluded_columns]
-        X = df_flat[feature_cols]
-
-        model = models[n_tel]
-        predictions = model.predict(X)
-
-        for i, idx in enumerate(group_df.index):
-            pred_xoff[idx] = predictions[i, 0]
-            pred_yoff[idx] = predictions[i, 1]
-
-    if selection_mask is not None:
-        pred_xoff = np.where(selection_mask, pred_xoff, -999.0)
-        pred_yoff = np.where(selection_mask, pred_yoff, -999.0)
-
-    return pred_xoff, pred_yoff
-
-
-def process_file_chunked(
-    input_file,
-    model_dir,
-    output_file,
-    image_selection,
-    max_events=None,
-    chunk_size=500000,
-):
-    """
-    Stream events from an input ROOT file in chunks, apply XGBoost models,
-    and write direction predictions to an output ROOT file.
-
-    Parameters
-    ----------
-    input_file : str
-        Path to the input ROOT file containing a "data" TTree.
-    model_dir : str
-        Directory containing the trained XGBoost model files named
-        ``dispdir_bdt_ntel{n_tel}_xgboost.joblib`` for different telescope
-        multiplicities.
-    output_file : str
-        Path to the output ROOT file to create. The file will contain a
-        "StereoAnalysis" TTree with ``Dir_Xoff`` and ``Dir_Yoff`` branches.
-    image_selection : str
-        String specifying which telescope indices to select, passed to
-        :func:`parse_image_selection` to obtain the corresponding indices
-        used by :func:`apply_image_selection`.
-    max_events : int, optional
-        Maximum number of events to process. If None (default), all
-        available events in the input file are processed.
-    chunk_size : int, optional
-        Number of events to read and process per chunk. Larger values reduce
-        I/O overhead but increase memory usage. Default is 500000.
-
-    Returns
-    -------
-    None
-        This function writes results directly to ``output_file`` and does not
-        return a value.
-    """
-    models = load_models(model_dir)
-    branch_list = get_branch_list()
-    selected_indices = parse_image_selection(image_selection)
-
-    _logger.info(f"Chunk size: {chunk_size}")
-    if max_events:
-        _logger.info(f"Maximum events to process: {max_events}")
-
-    with uproot.recreate(output_file) as root_file:
-        tree = root_file.mktree(
-            "StereoAnalysis",
-            {"Dir_Xoff": np.float32, "Dir_Yoff": np.float32},
-        )
-
-        total_processed = 0
-
-        for df_chunk in uproot.iterate(
-            f"{input_file}:data",
-            branch_list,
-            library="pd",
-            step_size=chunk_size,
-        ):
-            if df_chunk.empty:
-                continue
-
-            df_chunk = apply_image_selection(df_chunk, selected_indices)
-            if df_chunk.empty:
-                continue
-
-            # Stop early if we've reached max_events limit
-            if max_events is not None and total_processed >= max_events:
-                break
-
-            # Reset index to local chunk indices (0, 1, 2, ...) to avoid
-            # index out-of-bounds when indexing chunk-sized output arrays
-            df_chunk = df_chunk.reset_index(drop=True)
-
-            pred_xoff, pred_yoff = apply_models(df_chunk, models)
-
-            tree.extend(
-                {
-                    "Dir_Xoff": np.asarray(pred_xoff, dtype=np.float32),
-                    "Dir_Yoff": np.asarray(pred_yoff, dtype=np.float32),
-                }
-            )
-
-            total_processed += len(df_chunk)
-            _logger.info(f"Processed {total_processed} events so far")
-
-    _logger.info(
-        f"Streaming complete. Total processed events written: {total_processed}"
-    )
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description=("Apply XGBoost Multi-Target BDTs for Stereo Reconstruction")
-    )
-    parser.add_argument(
-        "--input-file",
-        required=True,
-        metavar="INPUT.root",
-        help="Path to input mscw ROOT file",
-    )
-    parser.add_argument(
-        "--model-dir",
-        required=True,
-        metavar="MODEL_DIR",
-        help="Directory containing XGBoost models",
-    )
-    parser.add_argument(
-        "--output-file",
-        required=True,
-        metavar="OUTPUT.root",
-        help="Output ROOT file path for predictions",
-    )
-    parser.add_argument(
-        "--image-selection",
-        type=str,
-        default="15",
-        help=(
-            "Optional telescope selection. Can be bit-coded (e.g., 14 for telescopes 1,2,3) "
-            "or comma-separated indices (e.g., '1,2,3'). "
-            "Keeps events with all selected telescopes or 4-telescope events. "
-            "Default is 15 (0b1111), which selects all 4 telescopes."
-        ),
-    )
-    parser.add_argument(
-        "--max-events",
-        type=int,
-        default=None,
-        help="Maximum number of events to process (default: all events)",
-    )
-    parser.add_argument(
-        "--chunk-size",
-        type=int,
-        default=500000,
-        help="Number of events to process per chunk (default: 500000)",
-    )
-    args = parser.parse_args()
-
-    start_time = time.time()
-    _logger.info("--- XGBoost Multi-Target Direction Evaluation ---")
-    _logger.info(f"Input file: {args.input_file}")
-    _logger.info(f"Model directory: {args.model_dir}")
-    _logger.info(f"Output file: {args.output_file}")
-    if args.image_selection:
-        _logger.info(f"Image selection: {args.image_selection}")
-
-    process_file_chunked(
-        input_file=args.input_file,
-        model_dir=args.model_dir,
-        output_file=args.output_file,
-        image_selection=args.image_selection,
-        max_events=args.max_events,
-        chunk_size=args.chunk_size,
-    )
-
-    elapsed_time = time.time() - start_time
-    _logger.info(f"Processing complete. Total time: {elapsed_time:.2f} seconds")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/py/tests/test_applyXGBoostforDirection.py b/py/tests/test_applyXGBoostforDirection.py
deleted file mode 100644
index 96de97e9..00000000
--- a/py/tests/test_applyXGBoostforDirection.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import applyXGBoostforDirection as mod
-import numpy as np
-import pandas as pd
-
-
-def test_parse_image_selection_indices():
-    indices = mod.parse_image_selection("1,2,3")
-    assert indices == [1, 2, 3]
-
-
-def test_parse_image_selection_bits():
-    indices = mod.parse_image_selection("14")  # 0b1110 -> [1,2,3]
-    assert indices == [1, 2, 3]
-
-
-def test_filter_by_telescope_selection_returns_mask_and_preserves_length():
-    df = pd.DataFrame(
-        {
-            "DispTelList_T": [
-                [1, 2, 3],  # has 1,2,3
-                [1, 3],  # missing 2
-                [0, 1, 2, 3],  # 4 telescopes -> always included
-                [0, 2],  # missing 1,3
-            ]
-        }
-    )
-    selected = [1, 2, 3]
-    mask = mod.filter_by_telescope_selection(df, selected)
-    assert isinstance(mask, pd.Series)
-    assert len(mask) == len(df)
-    # Expect True for rows 0 and 2
-    assert mask.tolist() == [True, False, True, False]
-
-
-class DummyModel:
-    def __init__(self, out_val=(0.0, 0.0)):
-        self.out_val = np.array(out_val, dtype=float)
-
-    def predict(self, X):
-        # Return shape (n_rows, 2) filled with out_val
-        n = len(X)
-        return np.tile(self.out_val, (n, 1))
-
-
-def test_apply_models_with_selection_mask(monkeypatch):
-    # Build a minimal DataFrame with required columns
-    df = pd.DataFrame(
-        {
-            "DispNImages": [3, 3, 4, 2, 3],
-            "DispTelList_T": [
-                [1, 2, 3],  # selected
-                [1, 3],  # unselected
-                [0, 1, 2, 3],  # 4 telescopes -> selected
-                [0, 1],  # unselected
-                [1, 2, 3],  # selected
-            ],
-            # Truth-related columns used downstream; values not relevant for this test
-            "Xoff": [0, 0, 0, 0, 0],
-            "Yoff": [0, 0, 0, 0, 0],
-            "Xoff_intersect": [0, 0, 0, 0, 0],
-            "Yoff_intersect": [0, 0, 0, 0, 0],
-        }
-    )
-
-    # Selection: require telescopes 1,2,3 or len==4
-    selection_mask = mod.filter_by_telescope_selection(df, [1, 2, 3])
-
-    # Monkeypatch model loading and existence checks
-    monkeypatch.setattr(mod.os.path, "exists", lambda p: True)
-
-    # Always return a DummyModel; value differentiates by n_tel if desired
-    def fake_load(path):
-        if "ntel4" in path:
-            return DummyModel(out_val=(4.0, -4.0))
-        elif "ntel3" in path:
-            return DummyModel(out_val=(3.0, -3.0))
-        else:
-            return DummyModel(out_val=(2.0, -2.0))
-
-    monkeypatch.setattr(mod.joblib, "load", fake_load)
-
-    # Monkeypatch flattening to avoid complex array inputs
-    def fake_flatten(group_df, n_tel, training_vars):
-        # Provide a simple feature column not in excluded list
-        return pd.DataFrame({"feature": np.zeros(len(group_df))}, index=group_df.index)
-
-    monkeypatch.setattr(mod, "flatten_data_vectorized", fake_flatten)
-
-    pred_x, pred_y = mod.apply_models(df, "dummy_models_dir", selection_mask)
-
-    # Output length must match input length
-    assert len(pred_x) == len(df)
-    assert len(pred_y) == len(df)
-
-    # Unselected rows must be -999; selected rows come from model values
-    # From selection_mask above: rows 0,2,4 are selected; 1,3 are unselected
-    expected_x = [3.0, -999.0, 4.0, -999.0, 3.0]
-    expected_y = [-3.0, -999.0, -4.0, -999.0, -3.0]
-    assert np.allclose(pred_x, expected_x, equal_nan=False)
-    assert np.allclose(pred_y, expected_y, equal_nan=False)
diff --git a/py/trainXGBoostforDirection.py b/py/trainXGBoostforDirection.py
deleted file mode 100644
index a2d7c317..00000000
--- a/py/trainXGBoostforDirection.py
+++ /dev/null
@@ -1,437 +0,0 @@
-"""
-Train XGBoost Multi-Target BDTs for Direction Reconstruction
-=================================================
-
-Uses x,y offsets calculated from intersection and dispBDT methods plus
-image parameters to train multi-target regression BDTs to predict x,y offsets.
-
-Separate BDTs are trained for 2, 3, and 4 telescope multiplicity events.
-
-"""
-
-import argparse
-import logging
-import os
-from typing import Dict, List
-
-import numpy as np
-import pandas as pd
-import uproot
-import xgboost as xgb
-from joblib import dump
-from sklearn.metrics import mean_absolute_error, mean_squared_error
-from sklearn.model_selection import train_test_split
-from sklearn.multioutput import MultiOutputRegressor
-from training_variables import xgb_training_variables
-
-logging.basicConfig(level=logging.INFO)
-_logger = logging.getLogger("trainXGBoostforDirection")
-
-
-def load_and_flatten_data(input_files, n_tel, max_events, training_step=True):
-    """
-    Reads the data from ROOT files, filters for the required multiplicity (n_tel),
-    and flattens the telescope-array features into a Pandas DataFrame.
-    """
-    _logger.info(f"\n--- Loading and Flattening Data for n_tel = {n_tel} ---")
-    _logger.info(
-        f"Max events to process: {max_events if max_events > 0 else 'All available'}"
-    )
-
-    branch_list = [
-        "DispNImages",
-        "DispTelList_T",
-        "Xoff",
-        "Yoff",
-        "Xoff_intersect",
-        "Yoff_intersect",
-        "MCxoff",
-        "MCyoff",
-        "MCe0",
-    ] + [var for var in xgb_training_variables()]
-
-    dfs = []
-    if max_events > 0:
-        max_events_per_file = max_events // len(input_files)
-    else:
-        max_events_per_file = None
-
-    for f in input_files:
-        try:
-            with uproot.open(f) as root_file:
-                if "data" in root_file:
-                    _logger.info(f"Processing file: {f}")
-                    tree = root_file["data"]
-                    df = tree.arrays(branch_list, library="pd")
-                    df = df[df["DispNImages"] == n_tel]
-                    _logger.info(f"Number of events after n_tel filter: {len(df)}")
-                    if max_events_per_file and len(df) > max_events_per_file:
-                        df = df.sample(n=max_events_per_file, random_state=42)
-                    if not df.empty:
-                        dfs.append(df)
-                else:
-                    _logger.warning(f"File: {f} does not contain a 'data' tree.")
-        except Exception as e:
-            _logger.error(f"Error opening or reading file {f}: {e}")
-
-    if len(dfs) == 0:
-        _logger.error("No data loaded from input files.")
-        return pd.DataFrame()
-
-    data_tree = pd.concat(dfs, ignore_index=True)
-    _logger.info(f"Total events for n_tel={n_tel}: {len(data_tree)}")
-
-    if data_tree.empty:
-        return pd.DataFrame()
-
-    # Compute weights:
-    # - R (to reflect physical sky area)
-    # - E (to balance energy distribution)
-    sample_weights = (
-        np.sqrt(
-            data_tree["MCxoff"] ** 2 + data_tree["MCyoff"] ** 2
-        )  # * data_tree["MCe0"]
-    )
-
-    df_flat = flatten_data_vectorized(data_tree, n_tel, xgb_training_variables())
-
-    if training_step:
-        df_flat["MCxoff"] = data_tree["MCxoff"]
-        df_flat["MCyoff"] = data_tree["MCyoff"]
-        df_flat["MCe0"] = np.log10(data_tree["MCe0"])
-        df_flat["sample_weight"] = sample_weights
-
-    df_flat.dropna(inplace=True)
-    _logger.info(f"Final events for n_tel={n_tel} after cleanup: {len(df_flat)}")
-
-    return df_flat
-
-
-def flatten_data_vectorized(df, n_tel, training_variables):
-    """Vectorized flattening of telescope array columns."""
-    flat_features = {}
-
-    try:
-        tel_list_matrix = np.vstack(df["DispTelList_T"].values)
-    except ValueError:
-        tel_list_matrix = np.array(df["DispTelList_T"].tolist())
-
-    for var_name in training_variables:
-        # Convert the column of arrays to a 2D numpy matrix
-        # Shape: (n_events, max_n_tel)
-        try:
-            data_matrix = np.vstack(df[var_name].values)
-        except ValueError:
-            data_matrix = np.array(df[var_name].tolist())
-
-        for i in range(n_tel):
-            col_name = f"{var_name}_{i}"
-
-            if var_name.startswith("Disp"):
-                # Case 1: Simple index i
-                if i < data_matrix.shape[1]:
-                    flat_features[col_name] = data_matrix[:, i]
-                else:
-                    flat_features[col_name] = np.full(len(df), np.nan)
-            else:
-                # Case 2: Index lookup via DispTelList_T
-                target_tel_indices = tel_list_matrix[:, i].astype(int)
-
-                row_indices = np.arange(len(df))
-                valid_mask = (target_tel_indices >= 0) & (
-                    target_tel_indices < data_matrix.shape[1]
-                )
-                result = np.full(len(df), np.nan)
-                result[valid_mask] = data_matrix[
-                    row_indices[valid_mask], target_tel_indices[valid_mask]
-                ]
-
-                flat_features[col_name] = result
-
-    # Convert dictionary to DataFrame once at the end
-    df_flat = pd.DataFrame(flat_features, index=df.index)
-
-    # Additional derived features to support finding certain event classes
-    for i in range(n_tel):
-        df_flat[f"disp_x_{i}"] = df_flat[f"Disp_T_{i}"] * df_flat[f"cosphi_{i}"]
-        df_flat[f"disp_y_{i}"] = df_flat[f"Disp_T_{i}"] * df_flat[f"sinphi_{i}"]
-        df_flat[f"loss_loss_{i}"] = df_flat[f"loss_{i}"] ** 2
-        df_flat[f"loss_dist{i}"] = df_flat[f"loss_{i}"] * df_flat[f"dist_{i}"]
-        df_flat[f"width_length_{i}"] = df_flat[f"width_{i}"] / (
-            df_flat[f"length_{i}"] + 1e-6
-        )
-        df_flat[f"size_{i}"] = np.log10(df_flat[f"size_{i}"] + 1e-6)  # avoid log(0)
-
-    df_flat["Xoff_weighted_bdt"] = df["Xoff"]
-    df_flat["Yoff_weighted_bdt"] = df["Yoff"]
-    df_flat["Xoff_intersect"] = df["Xoff_intersect"]
-    df_flat["Yoff_intersect"] = df["Yoff_intersect"]
-    df_flat["Diff_Xoff"] = df["Xoff"] - df["Xoff_intersect"]
-    df_flat["Diff_Yoff"] = df["Yoff"] - df["Yoff_intersect"]
-
-    return df_flat
-
-
-def train(df, n_tel, output_dir, train_test_fraction):
-    """
-    Trains a single XGBoost model for multi-target regression (Xoff, Yoff).
-
-    Parameters:
-    - df: Pandas DataFrame with training data.
-    - n_tel: Telescope multiplicity.
-    - output_dir: Directory to save the trained model.
-    - train_test_fraction: Fraction of data to use for training.
-    """
-    if df.empty:
-        _logger.warning(f"Skipping training for n_tel={n_tel} due to empty data.")
-        return
-
-    # Features (X) and targets (Y)
-    X_cols = [
-        col
-        for col in df.columns
-        if col not in ["MCxoff", "MCyoff", "MCe0", "sample_weight"]
-    ]
-    X = df[X_cols]
-    Y = df[["MCxoff", "MCyoff"]]
-
-    _logger.info(f"Training variables ({len(X_cols)}): {X_cols}")
-
-    X_train, X_test, Y_train, Y_test, W_train, _ = train_test_split(
-        X,
-        Y,
-        df["sample_weight"],
-        test_size=1.0 - train_test_fraction,
-        random_state=None,
-    )
-
-    _logger.info(
-        f"n_tel={n_tel}: Training events: {len(X_train)}, Testing events: {len(X_test)}"
-    )
-
-    # Parse TMVA options (simplified mapping to XGBoost parameters)
-    # The default TMVA string is:
-    # !V:NTrees=800:BoostType=Grad:Shrinkage=0.1:MaxDepth=4:MinNodeSize=1.0%
-
-    # Note: XGBoost defaults to gbtree (Gradient Boosting).
-    # MultiOutputRegressor requires a base estimator (e.g., plain XGBRegressor)
-    xgb_params = {
-        "n_estimators": 1000,
-        "learning_rate": 0.1,  # Shrinkage
-        "max_depth": 5,
-        "min_child_weight": 1.0,  # Equivalent to MinNodeSize=1.0% for XGBoost
-        "objective": "reg:squarederror",
-        # https://xgboosting.com/configure-xgboost-regpseudohubererror-objective/
-        # "objective": "reg:pseudohubererror",
-        "n_jobs": 4,
-        "random_state": None,
-        "tree_method": "hist",
-        "subsample": 0.7,  # Default sensible value
-        "colsample_bytree": 0.7,  # Default sensible value
-    }
-    rf_params = {
-        "n_estimators": 1000,
-        "max_depth": None,
-        "max_features": "sqrt",
-        "min_samples_leaf": 5,
-        "n_jobs": 4,
-        "random_state": None,
-    }
-    # Configure models
-    # - xgboost default approach
-    configs = {
-        "xgboost": xgb.XGBRegressor(**xgb_params),
-        # "random_forest": RandomForestRegressor(**rf_params),
-    }
-    _logger.info(
-        f"Sample weights (not(!) used in training) - min: {W_train.min():.6f}, "
-        f"max: {W_train.max():.6f}, mean: {W_train.mean():.6f}"
-    )
-
-    for name, estimator in configs.items():
-        _logger.info(f"Training with {name} for n_tel={n_tel}...")
-        _logger.info(f"parameters: {xgb_params if name == 'xgboost' else rf_params}")
-        model = MultiOutputRegressor(estimator)
-        model.fit(X_train, Y_train)
-
-        output_filename = os.path.join(
-            output_dir, f"dispdir_bdt_ntel{n_tel}_{name}.joblib"
-        )
-        dump(model, output_filename)
-        _logger.info(f"{name} model saved to: {output_filename}")
-
-        evaluate_model(model, X_test, Y_test, df, X_cols, Y, name)
-
-
-def evaluate_model(model, X_test, Y_test, df, X_cols, Y, name):
-    """
-    Evaluates the trained model on the test set and prints performance metrics.
-
-    """
-    score = model.score(X_test, Y_test)
-    _logger.info(f"XGBoost Multi-Target R^2 Score (Testing Set): {score:.4f}")
-    Y_pred = model.predict(X_test)
-    mse_x = mean_squared_error(Y_test["MCxoff"], Y_pred[:, 0])
-    mse_y = mean_squared_error(Y_test["MCyoff"], Y_pred[:, 1])
-    _logger.info(f"{name} MSE (X_off): {mse_x:.4f}, MSE (Y_off): {mse_y:.4f}")
-    mae_x = mean_absolute_error(Y_test["MCxoff"], Y_pred[:, 0])
-    mae_y = mean_absolute_error(Y_test["MCyoff"], Y_pred[:, 1])
-    _logger.info(f"{name} MAE (X_off): {mae_x:.4f}, MAE (Y_off): {mae_y:.4f}")
-
-    feature_importance(model, X_cols, Y.columns, name)
-    if name == "xgboost":
-        shap_feature_importance(model, X_test, Y.columns)
-
-    angular_resolution(
-        Y_pred,
-        Y_test,
-        df,
-        percentiles=[68, 90, 95],
-        log_e_min=-1,
-        log_e_max=2,
-        n_bins=6,
-        name=name,
-    )
-
-
-def angular_resolution(
-    Y_pred, Y_test, df, percentiles, log_e_min, log_e_max, n_bins, name
-):
-    """
-    Computes and logs the angular resolution based on predicted and true offsets.
-    """
-    results_df = pd.DataFrame(
-        {
-            "MCxoff_true": Y_test["MCxoff"].values,
-            "MCyoff_true": Y_test["MCyoff"].values,
-            "MCxoff_pred": Y_pred[:, 0],
-            "MCyoff_pred": Y_pred[:, 1],
-            "MCe0": df.loc[Y_test.index, "MCe0"].values,
-        }
-    )
-
-    results_df["DeltaTheta"] = np.sqrt(
-        (results_df["MCxoff_true"] - results_df["MCxoff_pred"]) ** 2
-        + (results_df["MCyoff_true"] - results_df["MCyoff_pred"]) ** 2
-    )
-
-    results_df["LogE"] = results_df["MCe0"]
-    bins = np.linspace(log_e_min, log_e_max, n_bins + 1)
-    results_df["E_bin"] = pd.cut(results_df["LogE"], bins=bins, include_lowest=True)
-    results_df.dropna(subset=["E_bin"], inplace=True)
-
-    resolution_results: Dict[str, List[float]] = {}
-    mean_logE_by_bin = (
-        results_df.groupby("E_bin", observed=False)["LogE"].mean().round(3)
-    )
-
-    # Group and calculate the specified percentile for DeltaTheta
-    for p in percentiles:
-        p_res = results_df.groupby("E_bin", observed=False)["DeltaTheta"].agg(
-            lambda x: np.percentile(x, p) if len(x) > 0 else np.nan
-        )
-        resolution_results[f"Theta_{p}%"] = p_res.values
-
-    output_df = pd.DataFrame(resolution_results, index=mean_logE_by_bin.index)
-    output_df.insert(0, "Mean Log10(E)", mean_logE_by_bin.values)
-    output_df.index.name = "Log10(E) Bin Range"
-    output_df = output_df.dropna()  # Drop bins with no events
-
-    _logger.info(f"\n--- {name} Angular Resolution vs. Log10(MCe0) ---")
-    _logger.info(
-        f"Calculated over {n_bins} bins between Log10(E) = {log_e_min} and {log_e_max}"
-    )
-    _logger.info("\n" + output_df.to_markdown(floatfmt=".4f"))
-
-
-def feature_importance(model, X_cols, target_names, name=None):
-    """
-    Prints feature importance from the trained XGBoost model.
-    """
-    _logger.info("--- XGBoost Multi-Regression Feature Importance ---")
-    for i, estimator in enumerate(model.estimators_):
-        target = target_names[i]
-        _logger.info(f"\n### {name} Importance for Target: **{target}**")
-
-        importances = estimator.feature_importances_
-        importance_df = pd.DataFrame({"Feature": X_cols, "Importance": importances})
-
-        importance_df = importance_df.sort_values(by="Importance", ascending=False)
-        _logger.info("\n" + importance_df.head(15).to_markdown(index=False))
-
-
-def shap_feature_importance(model, X, target_names, max_points=20000, n_top=25):
-    """
-    Uses XGBoost's builtin SHAP (pred_contribs=True).
-    Avoids SHAP.TreeExplainer compatibility issues with XGBoost ≥1.7.
-    """
-
-    # Subsample
-    X_sample = X.sample(n=min(len(X), max_points), random_state=0)
-
-    # Loop through estimators (one per target)
-    for i, est in enumerate(model.estimators_):
-        target = target_names[i]
-
-        # Builtin XGBoost SHAP values (n_samples, n_features+1)
-        # Last column is the bias term → drop it
-        shap_vals = est.get_booster().predict(xgb.DMatrix(X_sample), pred_contribs=True)
-        shap_vals = shap_vals[:, :-1]  # drop bias column
-
-        # Global importance: mean(|SHAP|)
-        imp = np.abs(shap_vals).mean(axis=0)
-        idx = np.argsort(imp)[::-1]
-
-        _logger.info(f"\n=== Builtin XGBoost SHAP Importance for {target} ===")
-        for j in idx[:n_top]:
-            _logger.info(f"{X.columns[j]:25s}  {imp[j]:.6e}")
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description=("Train XGBoost Multi-Target BDTs for Direction Reconstruction")
-    )
-    parser.add_argument("--input_file_list", help="List of input mscw ROOT files.")
-    parser.add_argument("--ntel", type=int, help="Telescope multiplicity (2, 3, or 4).")
-    parser.add_argument(
-        "--output_dir", help="Output directory for XGBoost models and weights."
-    )
-    parser.add_argument(
-        "--train_test_fraction",
-        type=float,
-        help="Fraction of data for training (e.g., 0.5).",
-    )
-    parser.add_argument(
-        "--max_events",
-        type=int,
-        help="Maximum number of events to process across all files.",
-    )
-
-    args = parser.parse_args()
-
-    try:
-        with open(args.input_file_list, "r") as f:
-            input_files = [line.strip() for line in f if line.strip()]
-    except FileNotFoundError as exc:
-        raise FileNotFoundError(
-            f"Error: Input file list not found: {args.input_file_list}"
-        ) from exc
-
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-
-    _logger.info("--- XGBoost Multi-Target Direction Training ---")
-    _logger.info(f"Input files: {len(input_files)}")
-    _logger.info(f"Telescope multiplicity: {args.ntel}")
-    _logger.info(f"Output directory: {args.output_dir}")
-    _logger.info(
-        f"Train vs test fraction: {args.train_test_fraction}, Max events: {args.max_events}"
-    )
-
-    df_flat = load_and_flatten_data(input_files, args.ntel, args.max_events)
-    train(df_flat, args.ntel, args.output_dir, args.train_test_fraction)
-    _logger.info("\nXGBoost model trained successfully.")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/py/training_variables.py b/py/training_variables.py
deleted file mode 100644
index 53cb44c4..00000000
--- a/py/training_variables.py
+++ /dev/null
@@ -1,28 +0,0 @@
-"""Training variables for XGBoost direction reconstruction."""
-
-
-def xgb_training_variables():
-    """
-    Telescope-type training variables for XGB.
-
-    Disp variables with different indexing logic in data preparation.
-    """
-
-    return [
-        "Disp_T",
-        "DispXoff_T",
-        "DispYoff_T",
-        "DispWoff_T",
-        "cen_x",
-        "cen_y",
-        "cosphi",
-        "sinphi",
-        "loss",
-        "size",
-        "dist",
-        "width",
-        "length",
-        "asym",
-        "tgrad_x",
-        "R_core",
-    ]