Add SampleSet.as_regions() method.

tymorrow · tymorrow · commit 1990a9aa1e84 · 2025-02-13T15:06:53.000-07:00
diff --git a/riid/data/sampleset.py b/riid/data/sampleset.py
@@ -340,7 +340,7 @@ def difficulty_score(self, mean=10.0, std=3.0) -> float:
     @property
     def ecal(self):
         """Get or set the ecal terms."""
-        ecal_terms = self.info[list(self.ECAL_INFO_COLUMNS)].values
+        ecal_terms = self.info[list(self.ECAL_INFO_COLUMNS)].to_numpy(dtype=float)
         return ecal_terms
 
     @ecal.setter
@@ -543,6 +543,9 @@ def as_ecal(self, new_offset: float, new_gain: float,
             new_cubic: new cubic value, i.e. the 3-th e-cal term
             new_low_energy: new low energy term
 
+        Returns:
+            A new `SampleSet` with `spectra` and `info` DataFrames
+
         Raises:
             `ValueError` when no argument values are provided
         """
@@ -589,6 +592,48 @@ def as_ecal(self, new_offset: float, new_gain: float,
         new_ss.info[ecal_cols] = new_ecal
         return new_ss
 
+    def as_regions(self, rois: list) -> SampleSet:
+        """Obtains a new `SampleSet` where the spectra are limited to specific
+        regions of interest (ROIs).
+
+        Notes:
+            - If your samples have disparate energy calibration terms, call `as_ecal()` first
+              to align channel space, then you may call this function. Otherwise, it is possible
+              to end up with a ragged array of spectra, which we do not support.
+            - After this call, `spectra` will have columns filled in with energy values for
+              convenience. As such, in the context of the returned `SampleSet`, the energy
+              calibration terms in `info` will no longer have any meaning, and any subsequent
+              calls to methods like `as_ecal()` would not make sense.  This method is intended
+              as a last step to be performed right before analysis of whatever kind.
+
+        Args:
+            rois: a list of 2-tuples, each a half-open range [low energy, high energy)
+
+        Returns:
+            A new `SampleSet` with only ROIs remaining in the `spectra` DataFrame
+
+        Raises:
+            `ValueError` when `rois` is empty or samples have different energy calibrations
+        """
+        if not rois:
+            raise ValueError("At least one ROI must be provided.")
+        all_ecals = self.ecal
+        all_ecals_are_same = np.isclose(all_ecals, all_ecals[0]).all()
+        if not all_ecals_are_same:
+            msg = "Spectra have different energy calibrations; consider `as_ecal()` first."
+            raise ValueError(msg)
+
+        energies = self.get_channel_energies(0)
+        mask = _get_energy_roi_masks(rois, energies)
+        new_spectra = self.spectra.to_numpy(dtype=float)[:, mask]
+        new_spectra = new_spectra.reshape((self.n_samples, -1))
+        mask_energies = energies[mask]
+
+        new_ss = self[:]
+        new_ss.spectra = pd.DataFrame(new_spectra, columns=mask_energies)
+        new_ss.info.total_counts = new_ss.spectra.sum(axis=1)
+        return new_ss
+
     def check_seed_health(self, dead_time_threshold=1.0):
         """Checks health of all spectra and info assuming they are seeds.
 
@@ -1905,6 +1950,14 @@ def _get_distance_df_from_values(distance_values: np.ndarray,
     return distance_df
 
 
+def _get_energy_roi_masks(rois: list, energies: np.ndarray) -> np.ndarray:
+    masks = np.zeros(energies.shape, dtype=bool)
+    for (elow, ehigh) in rois:
+        roi_mask = (elow <= energies) & (energies < ehigh)
+        masks |= roi_mask
+    return masks
+
+
 class InvalidSampleSetFileError(Exception):
     """Missing or invalid keys in a file being read in as a `SampleSet`."""
     pass
diff --git a/tests/sampleset_tests.py b/tests/sampleset_tests.py
@@ -894,7 +894,7 @@ def test_get_confidences(self):
             return_gross=True,
             rng=rng
         )
-        _, synthetic_gross_ss = synth.generate(fg_seeds_ss, bg_seeds_ss[0])
+        _, synthetic_gross_ss = synth.generate(fg_seeds_ss, bg_seeds_ss[0], verbose=False)
         synthetic_gross_ss.drop_sources(bg_seeds_ss.sources.columns.levels[2])
         synthetic_gross_ss.sources = synthetic_gross_ss.sources[fg_seeds_ss.sources.columns]
         synthetic_gross_ss.prediction_probas = pd.DataFrame(
@@ -907,7 +907,8 @@ def test_get_confidences(self):
             bg_cps=synth.bg_cps
         )
 
-        _, synthetic_mixed_gross_ss = synth.generate(mixed_fg_seeds_ss, bg_seeds_ss[0])
+        _, synthetic_mixed_gross_ss = synth.generate(mixed_fg_seeds_ss, bg_seeds_ss[0],
+                                                     verbose=False)
         synthetic_mixed_gross_ss.drop_sources(bg_seeds_ss.sources.columns.levels[2])
         synthetic_mixed_gross_ss.sources = synthetic_mixed_gross_ss.sources[
             fg_seeds_ss.sources.columns
@@ -975,6 +976,60 @@ def test_get_confidences(self):
                 bg_cps=None
             )
 
+    def test_get_energy_roi_masks(self):
+        from riid.data.sampleset import _get_energy_roi_masks
+        ROIS1 = [
+            (0, 2),
+            (4, 8),
+        ]
+        ROIS2 = [
+            (0, 0.75),
+            (2.0, 2.5),
+            (3.0, 5.0),
+        ]
+        ENERGIES = np.array([
+            [0, 1, 2, 3, 4, 5, 6],
+            [0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
+        ])
+        EXPECTED_MASKS1 = np.array([
+            [True, True, False, False, True, True, True],
+            [True, True, True, True, False, False, False],
+        ])
+        EXPECTED_MASKS2 = np.array([
+            [True, False, True, True, True, False, False],
+            [True, True, False, False, True, False, True],
+        ])
+        masks1 = _get_energy_roi_masks(ROIS1, ENERGIES)
+        masks2 = _get_energy_roi_masks(ROIS2, ENERGIES)
+
+        self.assertTrue(np.array_equal(masks1, EXPECTED_MASKS1))
+        self.assertTrue(np.array_equal(masks2, EXPECTED_MASKS2))
+
+    def test_as_regions(self):
+        from riid.data.sampleset import _get_energy_roi_masks
+        ROIS = [
+            (0, 100),
+            (500, 550),
+            (2400, 2600),
+        ]
+        ss1 = get_dummy_seeds(n_channels=1000)
+        ss2 = get_dummy_seeds(n_channels=1000).as_ecal(0, 2500, 0, 0, 0)
+        ss3 = get_dummy_seeds(n_channels=500).as_ecal(20, 2000, 0, 0, 0)
+
+        with self.assertRaises(ValueError):
+            ss1.as_regions([])
+
+        ss_mixed_ecal = SampleSet()
+        ss_mixed_ecal.concat([ss1, ss2])
+        with self.assertRaises(ValueError):
+            ss_mixed_ecal.as_regions(ROIS)
+
+        for ss in [ss1, ss2, ss3]:
+            channel_energies = ss.get_channel_energies(0)
+            ss_channels_expected = _get_energy_roi_masks(ROIS, channel_energies).sum()
+            rois = ss.as_regions(ROIS)
+            self.assertEqual(rois.n_channels, ss_channels_expected)
+
     def _assert_row_labels(self, level, actual, expected):
         for i, (a, e) in enumerate(zip(actual, expected)):
             self.assertEqual(