Add img-angres as feature (#51)

GernotMaier · web-flow · commit 27dbfc482f8f · 2026-02-19T11:10:55.000+01:00
* Add img-angres as feature
* add multiplicity cut
diff --git a/docs/changes/51.feature.md b/docs/changes/51.feature.md
@@ -0,0 +1,3 @@
+Improve stereo reconstruction by adding the geometrical feature img2_ang.
+Lower the minimum clipping value for size from 10 to 1 (relevant for small images in SSTs).
+Add the `--preview_rows` command line parameter to control how many events are printed in the sorted telescope preview log used for debugging (set to 0 to disable).
diff --git a/src/eventdisplay_ml/config.py b/src/eventdisplay_ml/config.py
@@ -91,6 +91,21 @@ def configure_training(analysis_type):
         help="Maximum number of telescopes to keep per mirror area type (for feature reduction).",
         default=None,
     )
+    parser.add_argument(
+        "--preview_rows",
+        type=int,
+        help=(
+            "Number of events to include in the sorted telescope preview log (set to 0 to disable)."
+        ),
+        default=20,
+    )
+    if analysis_type == "stereo_analysis":
+        parser.add_argument(
+            "--min_images",
+            type=int,
+            help="Minimum number of images (DispNImages) for quality cut (default: 2).",
+            default=2,
+        )
 
     model_configs = vars(parser.parse_args())
 
@@ -102,8 +117,11 @@ def configure_training(analysis_type):
     _logger.info(f"Random state: {model_configs['random_state']}")
     _logger.info(f"Max events: {model_configs['max_events']}")
     _logger.info(f"Max CPU cores: {model_configs['max_cores']}")
+    _logger.info(f"Preview rows: {model_configs['preview_rows']}")
     if model_configs.get("max_tel_per_type") is not None:
         _logger.info(f"Max telescopes per mirror area type: {model_configs['max_tel_per_type']}")
+    if analysis_type == "stereo_analysis":
+        _logger.info(f"Minimum images (DispNImages): {model_configs.get('min_images')}")
 
     model_configs["models"] = hyper_parameters(
         analysis_type, model_configs.get("hyperparameter_config")
@@ -112,7 +130,9 @@ def configure_training(analysis_type):
     model_configs["targets"] = target_features(analysis_type)
 
     if analysis_type == "stereo_analysis":
-        model_configs["pre_cuts"] = pre_cuts_regression(model_configs.get("n_tel"))
+        model_configs["pre_cuts"] = pre_cuts_regression(
+            model_configs.get("n_tel"), min_images=model_configs.get("min_images", 2)
+        )
     elif analysis_type == "classification":
         _logger.info(f"Energy bin {model_configs['energy_bin_number']}")
         model_parameters = utils.load_model_parameters(
@@ -193,6 +213,14 @@ def configure_apply(analysis_type):
         help="Observatory/site name for geomagnetic field (default: VERITAS).",
         default="VERITAS",
     )
+    parser.add_argument(
+        "--preview_rows",
+        type=int,
+        help=(
+            "Number of events to include in the sorted telescope preview log (set to 0 to disable)."
+        ),
+        default=20,
+    )
 
     model_configs = vars(parser.parse_args())
 
@@ -204,6 +232,7 @@ def configure_apply(analysis_type):
     _logger.info(f"Image selection: {model_configs.get('image_selection')}")
     _logger.info(f"Max events: {model_configs.get('max_events')}")
     _logger.info(f"Max cores: {model_configs.get('max_cores')}")
+    _logger.info(f"Preview rows: {model_configs['preview_rows']}")
 
     model_configs["models"], par = load_models(
         analysis_type, model_configs["model_prefix"], model_configs["model_name"]
diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py
@@ -426,6 +426,7 @@ def flatten_telescope_data_vectorized(
     tel_config=None,
     observatory="veritas",
     max_tel_per_type=None,
+    preview_rows=20,
 ):
     """
     Vectorized flattening of telescope array columns.
@@ -451,6 +452,8 @@ def flatten_telescope_data_vectorized(
         Observatory name for indexing mode detection. Default is "veritas".
     max_tel_per_type : int, optional
         Maximum number of telescopes to keep per mirror area type. If None, keep all.
+    preview_rows : int, optional
+        Number of events to include in the sorting preview log. Set to 0 to disable.
 
     Returns
     -------
@@ -563,7 +566,14 @@ def flatten_telescope_data_vectorized(
         flat_features = filtered_features
 
     index = _get_index(df, n_evt)
-    df_flat = flatten_telescope_variables(n_tel, flat_features, index, tel_config, analysis_type)
+    df_flat = flatten_telescope_variables(
+        n_tel,
+        flat_features,
+        index,
+        tel_config=tel_config,
+        analysis_type=analysis_type,
+        preview_rows=preview_rows,
+    )
     return pd.concat(
         [df_flat, extra_columns(df, analysis_type, training, index, tel_config, observatory)],
         axis=1,
@@ -706,6 +716,7 @@ def flatten_feature_data(
     tel_config=None,
     observatory="veritas",
     max_tel_per_type=None,
+    preview_rows=20,
 ):
     """
     Get flattened features for events.
@@ -728,6 +739,8 @@ def flatten_feature_data(
         Observatory name for indexing mode detection.
     max_tel_per_type : int, optional
         Maximum number of telescopes to keep per mirror area type. If None, keep all.
+    preview_rows : int, optional
+        Number of events to include in the sorting preview log. Set to 0 to disable.
     """
     df_flat = flatten_telescope_data_vectorized(
         group_df,
@@ -738,6 +751,7 @@ def flatten_feature_data(
         tel_config=tel_config,
         observatory=observatory,
         max_tel_per_type=max_tel_per_type,
+        preview_rows=preview_rows,
     )
     max_tel_id = tel_config["max_tel_id"] if tel_config else ntel - 1
     excluded_columns = set(features_module.target_features(analysis_type)) | set(
@@ -855,6 +869,7 @@ def load_training_data(model_configs, file_list, analysis_type):
                     tel_config=tel_config,
                     observatory=model_configs.get("observatory", "veritas"),
                     max_tel_per_type=model_configs.get("max_tel_per_type", None),
+                    preview_rows=model_configs.get("preview_rows", 20),
                 )
                 if analysis_type == "stereo_analysis":
                     new_cols = {
@@ -941,7 +956,14 @@ def apply_clip_intervals(df, n_tel=None, apply_log10=None):
                     df.loc[mask_to_log, var_base] = np.log10(df.loc[mask_to_log, var_base])
 
 
-def flatten_telescope_variables(n_tel, flat_features, index, tel_config=None, analysis_type=None):
+def flatten_telescope_variables(
+    n_tel,
+    flat_features,
+    index,
+    tel_config=None,
+    analysis_type=None,
+    preview_rows=20,
+):
     """Generate dataframe for telescope variables flattened for all telescopes.
 
     Creates features for all telescope IDs, using NaN as default value for missing data.
@@ -958,6 +980,8 @@ def flatten_telescope_variables(n_tel, flat_features, index, tel_config=None, an
         Telescope configuration with 'max_tel_id' key.
     analysis_type : str, optional
         Type of analysis, e.g. "classification" or "stereo_analysis".
+    preview_rows : int, optional
+        Number of events to include in the sorting preview log. Set to 0 to disable.
     """
     df_flat = pd.DataFrame(flat_features, index=index)
     df_flat = df_flat.astype(np.float32)
@@ -988,11 +1012,13 @@ def flatten_telescope_variables(n_tel, flat_features, index, tel_config=None, an
     size_cols = [c for c in df_flat.columns if c.startswith("size_")][: max_tel_id + 1]
     area_cols = [c for c in df_flat.columns if c.startswith("mirror_area_")][: max_tel_id + 1]
     disp_cols = [c for c in df_flat.columns if c.startswith("Disp_T_")][: max_tel_id + 1]
-    preview = df_flat[size_cols + area_cols + disp_cols].head(20)
-    _logger.info(
-        "Sorted telescope sizes (pre-clip/log10), first 20 events: \n"
-        f"{preview.to_string(index=False)}"
-    )
+    if preview_rows and preview_rows > 0:
+        preview = df_flat[size_cols + area_cols + disp_cols].head(preview_rows)
+        _logger.info(
+            "Sorted telescope sizes (pre-clip/log10), first %d events: \n%s",
+            preview_rows,
+            preview.to_string(index=False),
+        )
 
     apply_clip_intervals(
         df_flat,
@@ -1098,6 +1124,7 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato
                 - _to_numpy_1d(df["Yoff_intersect"], np.float32)
             ).astype(np.float32),
             "DispNImages": _to_numpy_1d(df["DispNImages"], np.int32),
+            "img2_ang": _to_numpy_1d(df["img2_ang"], np.float32),
             # These may be absent in some datasets; if missing, fill with NaN
             "Erec": (
                 _to_numpy_1d(df["Erec"], np.float32)
diff --git a/src/eventdisplay_ml/features.py b/src/eventdisplay_ml/features.py
@@ -116,6 +116,7 @@ def _regression_features(training):
         "DispNImages",
         "DispTelList_T",
         "ImgSel_list",
+        "img2_ang",
         "Xoff",
         "Yoff",
         "Xoff_intersect",
@@ -182,8 +183,9 @@ def clip_intervals():
         "ErecS": (energy_min, None),
         "EChi2S": (energy_min, None),
         "EmissionHeightChi2": (1e-6, None),
+        "img2_ang": (0.0, 360.0),
         # Per-telescope energy and size variables - log10 transformation with lower bound
-        "size": (10, None),
+        "size": (1, None),
         "E": (energy_min, None),
         "ES": (energy_min, None),
         "ntubes": (1, None),
diff --git a/src/eventdisplay_ml/hyper_parameters.py b/src/eventdisplay_ml/hyper_parameters.py
@@ -88,11 +88,26 @@ def _load_hyper_parameters_from_file(config_file):
     return hyperparameters
 
 
-def pre_cuts_regression(n_tel):
-    """Get pre-cuts for regression analysis."""
-    event_cut = "DispNImages >=2"
+def pre_cuts_regression(n_tel, min_images=2):
+    """
+    Get pre-cuts for regression analysis.
+
+    Parameters
+    ----------
+    n_tel : int or None
+        Number of telescopes (not currently used).
+    min_images : int
+        Minimum number of images (DispNImages) for quality cut (default: 2).
+
+    Returns
+    -------
+    str or None
+        Pre-cut string for filtering events.
+    """
+    cuts = [f"DispNImages >={min_images}"]
     if PRE_CUTS_REGRESSION:
-        event_cut = " & ".join(f"({c})" for c in PRE_CUTS_REGRESSION)
+        cuts.extend(PRE_CUTS_REGRESSION)
+    event_cut = " & ".join(f"({c})" for c in cuts)
     _logger.info(f"Pre-cuts (regression): {event_cut if event_cut else 'None'}")
     return event_cut if event_cut else None
 
diff --git a/src/eventdisplay_ml/models.py b/src/eventdisplay_ml/models.py
@@ -252,6 +252,7 @@ def apply_regression_models(df, model_configs):
         training=False,
         tel_config=tel_config,
         observatory=model_configs.get("observatory", "veritas"),
+        preview_rows=model_configs.get("preview_rows", 20),
     )
 
     models = model_configs["models"]
@@ -313,6 +314,7 @@ def apply_classification_models(df, model_configs, threshold_keys):
             training=False,
             tel_config=tel_config,
             observatory=model_configs.get("observatory", "veritas"),
+            preview_rows=model_configs.get("preview_rows", 20),
         )
         model = models[e_bin]["model"]
         flatten_data = flatten_data.reindex(columns=models[e_bin]["features"])
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -43,6 +43,7 @@ def create_base_df(n_rows=2, n_tel=2):
             "Erec": np.arange(n_rows, dtype=float) * 10.0 + 10.0,
             "ErecS": np.arange(n_rows, dtype=float) * 5.0 + 5.0,
             "EmissionHeight": np.arange(n_rows, dtype=float) * 100.0 + 100.0,
+            "img2_ang": np.arange(n_rows, dtype=float) * 15.0 + 30.0,
         }
     )
 
@@ -94,6 +95,7 @@ def df_three_tel_missing():
             "Erec": [10.0],
             "ErecS": [5.0],
             "EmissionHeight": [100.0],
+            "img2_ang": [45.0],
         }
     )
 
@@ -146,6 +148,7 @@ def sample_df():
             "Erec": [100.0, 200.0, 300.0, 400.0],
             "ErecS": [90.0, 180.0, 270.0, 360.0],
             "EmissionHeight": [10.0, 11.0, 12.0, 13.0],
+            "img2_ang": [45.0, 50.0, 55.0, 60.0],
         }
     )
 
diff --git a/tests/test_imgsel_debug.py b/tests/test_imgsel_debug.py
@@ -47,6 +47,7 @@ def test_imgsel_sorting_and_alignment():
             "Erec": [1.0],
             "ErecS": [1.0],
             "EmissionHeight": [10.0],
+            "img2_ang": [45.0],
             "cosphi": [np.array([0.0, 0.0], dtype=float)],
             "sinphi": [np.array([1.0, 1.0], dtype=float)],
             "loss": [np.array([0.0, 0.0], dtype=float)],
diff --git a/tests/test_size_area_sort.py b/tests/test_size_area_sort.py
@@ -36,6 +36,7 @@ def test_mirror_area_then_size_sorting():
             "Erec": [1.0],
             "ErecS": [1.0],
             "EmissionHeight": [10.0],
+            "img2_ang": [45.0],
             # Supply base telescope arrays referenced in feature list; others will default to NaN
             "Disp_T": [np.array([0.0, 0.0, 0.0], dtype=float)],
             "cosphi": [np.array([1.0, 1.0, 1.0], dtype=float)],
@@ -59,12 +60,9 @@ def test_mirror_area_then_size_sorting():
     )
 
     # Expected telescope order: TelID 1 (area 100), TelID 2 (area 50, size 20), TelID 0 (area 50, size 10)
-    # Note: sizes are clipped to minimum 10 before sorting, so size 8→10
-    # After clipping: [10, 10, 20] at TelIDs [0, 1, 2]
-    # After sort by area (desc) then size (desc): [1, 2, 0]
-    # Result: [clipped[1]=10, clipped[2]=20, clipped[0]=10]
+    # Note: sizes are not clipped before sorting.
     expected_areas = [100.0, 50.0, 50.0]
-    expected_sizes = [np.log10(10.0), np.log10(20.0), np.log10(10.0)]  # After clipping
+    expected_sizes = [np.log10(8.0), np.log10(20.0), np.log10(10.0)]
     expected_rel_x = [1000.0, 2000.0, 0.0]
 
     np.testing.assert_allclose(

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Improve stereo reconstruction by adding the geometrical feature img2_ang.`
	`2`	`+Change clipping min for size to '1' (applicable for small images in SSTs).`
	`3`	`+Add preview_rows as command line parameter to allow flexible printout for debugging.`
Original file line number	Diff line number	Diff line change
`@@ -252,6 +252,7 @@ def apply_regression_models(df, model_configs):`
`252`	`252`	`training=False,`
`253`	`253`	`tel_config=tel_config,`
`254`	`254`	`observatory=model_configs.get("observatory", "veritas"),`
	`255`	`+ preview_rows=model_configs.get("preview_rows", 20),`
`255`	`256`	`)`
`256`	`257`
`257`	`258`	`models = model_configs["models"]`
`@@ -313,6 +314,7 @@ def apply_classification_models(df, model_configs, threshold_keys):`
`313`	`314`	`training=False,`
`314`	`315`	`tel_config=tel_config,`
`315`	`316`	`observatory=model_configs.get("observatory", "veritas"),`
	`317`	`+ preview_rows=model_configs.get("preview_rows", 20),`
`316`	`318`	`)`
`317`	`319`	`model = models[e_bin]["model"]`
`318`	`320`	`flatten_data = flatten_data.reindex(columns=models[e_bin]["features"])`
Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,7 @@ def create_base_df(n_rows=2, n_tel=2):`
`43`	`43`	`"Erec": np.arange(n_rows, dtype=float) * 10.0 + 10.0,`
`44`	`44`	`"ErecS": np.arange(n_rows, dtype=float) * 5.0 + 5.0,`
`45`	`45`	`"EmissionHeight": np.arange(n_rows, dtype=float) * 100.0 + 100.0,`
	`46`	`+ "img2_ang": np.arange(n_rows, dtype=float) * 15.0 + 30.0,`
`46`	`47`	`}`
`47`	`48`	`)`
`48`	`49`
`@@ -94,6 +95,7 @@ def df_three_tel_missing():`
`94`	`95`	`"Erec": [10.0],`
`95`	`96`	`"ErecS": [5.0],`
`96`	`97`	`"EmissionHeight": [100.0],`
	`98`	`+ "img2_ang": [45.0],`
`97`	`99`	`}`
`98`	`100`	`)`
`99`	`101`
`@@ -146,6 +148,7 @@ def sample_df():`
`146`	`148`	`"Erec": [100.0, 200.0, 300.0, 400.0],`
`147`	`149`	`"ErecS": [90.0, 180.0, 270.0, 360.0],`
`148`	`150`	`"EmissionHeight": [10.0, 11.0, 12.0, 13.0],`
	`151`	`+ "img2_ang": [45.0, 50.0, 55.0, 60.0],`
`149`	`152`	`}`
`150`	`153`	`)`
`151`	`154`