From 3768c198530f9b242966e42bbc2175800f0ec54f Mon Sep 17 00:00:00 2001
From: Joseph Abbott <joseph.william.abbott@gmail.com>
Date: Tue, 26 Mar 2024 18:49:32 +0000
Subject: [PATCH 1/4] Write failing tests for when there are non-unique
 selections for zero score

---
 tests/test_feature_pcov_cur.py   |  2 +-
 tests/test_feature_simple_fps.py | 17 +++++++++++++++++
 tests/test_sample_simple_cur.py  | 17 +++++++++++++++++
 tests/test_sample_simple_fps.py  | 18 ++++++++++++++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/tests/test_feature_pcov_cur.py b/tests/test_feature_pcov_cur.py
index d1f22b358..d57dfcd39 100644
--- a/tests/test_feature_pcov_cur.py
+++ b/tests/test_feature_pcov_cur.py
@@ -41,7 +41,7 @@ def test_non_it(self):
         self.idx = [2, 8, 3, 6, 7, 9, 1, 0, 5]
         selector = PCovCUR(n_to_select=9, recompute_every=0)
         selector.fit(self.X, self.y)
-
+        print(selector.selected_idx_)
         self.assertTrue(np.allclose(selector.selected_idx_, self.idx))
 
 
diff --git a/tests/test_feature_simple_fps.py b/tests/test_feature_simple_fps.py
index b29a2bc7b..8d7d40304 100644
--- a/tests/test_feature_simple_fps.py
+++ b/tests/test_feature_simple_fps.py
@@ -1,5 +1,6 @@
 import unittest
 
+import numpy as np
 from sklearn.datasets import load_diabetes as get_dataset
 from sklearn.utils.validation import NotFittedError
 
@@ -62,6 +63,22 @@ def test_get_distances(self):
             selector = FPS(n_to_select=7)
             _ = selector.get_select_distance()
 
+    def test_unique_selected_idx_zero_score(self):
+        """
+        Tests that the selected idxs are unique, which may not be the
+        case when the score is numerically zero
+        """
+        np.random.seed(0)
+        n_samples = 10
+        n_features = 15
+        X = np.random.rand(n_samples, n_features)
+        X[:, 3] = np.random.rand(10) * 1e-13
+        X[:, 4] = np.random.rand(10) * 1e-13
+        selector_problem = FPS(n_to_select=len(X.T)).fit(X)
+        assert len(selector_problem.selected_idx_) == len(
+            set(selector_problem.selected_idx_)
+        )
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/tests/test_sample_simple_cur.py b/tests/test_sample_simple_cur.py
index b3a9437e1..ab45dc825 100644
--- a/tests/test_sample_simple_cur.py
+++ b/tests/test_sample_simple_cur.py
@@ -58,6 +58,23 @@ def test_non_it(self):
 
         self.assertTrue(np.allclose(selector.selected_idx_, ref_idx))
 
+    def test_unique_selected_idx_zero_score(self):
+        """
+        Tests that the selected idxs are unique, which may not be the
+        case when the score is numerically zero.
+        """
+        np.random.seed(0)
+        n_samples = 10
+        n_features = 15
+        X = np.random.rand(n_samples, n_features)
+        X[4, :] = np.random.rand(15) * 1e-13
+        X[5, :] = np.random.rand(15) * 1e-13
+        X[6, :] = np.random.rand(15) * 1e-13
+        selector_problem = CUR(n_to_select=len(X)).fit(X)
+        assert len(selector_problem.selected_idx_) == len(
+            set(selector_problem.selected_idx_)
+        )
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/tests/test_sample_simple_fps.py b/tests/test_sample_simple_fps.py
index ca7ee4bee..afd4d2111 100644
--- a/tests/test_sample_simple_fps.py
+++ b/tests/test_sample_simple_fps.py
@@ -1,5 +1,6 @@
 import unittest
 
+import numpy as np
 from sklearn.datasets import load_diabetes as get_dataset
 from sklearn.utils.validation import NotFittedError
 
@@ -81,6 +82,23 @@ def test_threshold(self):
         self.assertEqual(len(selector.selected_idx_), 5)
         self.assertEqual(selector.selected_idx_.tolist(), self.idx[:5])
 
+    def test_unique_selected_idx_zero_score(self):
+        """
+        Tests that the selected idxs are unique, which may not be the
+        case when the score is numerically zero.
+        """
+        np.random.seed(0)
+        n_samples = 10
+        n_features = 15
+        X = np.random.rand(n_samples, n_features)
+        X[4, :] = np.random.rand(15) * 1e-13
+        X[5, :] = np.random.rand(15) * 1e-13
+        X[6, :] = np.random.rand(15) * 1e-13
+        selector_problem = FPS(n_to_select=len(X)).fit(X)
+        assert len(selector_problem.selected_idx_) == len(
+            set(selector_problem.selected_idx_)
+        )
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)

From 721622087e1a8157d34afb0abc3ff67da6ac77b7 Mon Sep 17 00:00:00 2001
From: Joseph Abbott <joseph.william.abbott@gmail.com>
Date: Tue, 26 Mar 2024 18:57:12 +0000
Subject: [PATCH 2/4] Remove redundant line

---
 src/skmatter/_selection.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index 95e43ed15..daa1894cb 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -236,8 +236,6 @@ def fit(self, X, y=None, warm_start=False):
         n_to_select_from = X.shape[self._axis]
         self.n_samples_in_, self.n_features_in_ = X.shape
 
-        self.n_samples_in_, self.n_features_in_ = X.shape
-
         error_msg = (
             "n_to_select must be either None, an "
             f"integer in [1, n_{self.selection_type}s] "

From a1ce7720da32624b6a6b331122830838aac31d78 Mon Sep 17 00:00:00 2001
From: Joseph Abbott <joseph.william.abbott@gmail.com>
Date: Tue, 26 Mar 2024 19:08:40 +0000
Subject: [PATCH 3/4] Partial fix: only select idxs that have been previously
 selected

---
 src/skmatter/_selection.py     | 6 +++++-
 tests/test_feature_pcov_cur.py | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index daa1894cb..cd5f86569 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -437,7 +437,11 @@ def _continue_greedy_search(self, X, y, n_to_select):
     def _get_best_new_selection(self, scorer, X, y):
         scores = scorer(X, y)
 
-        max_score_idx = np.argmax(scores)
+        # Get the score argmax, but only for idxs not already selected
+        _tmp_scores = {
+            i: score for i, score in enumerate(scores) if i not in self.selected_idx_
+        }
+        max_score_idx = max(_tmp_scores, key=_tmp_scores.get)
         if self.score_threshold is not None:
             if self.first_score_ is None:
                 self.first_score_ = scores[max_score_idx]
diff --git a/tests/test_feature_pcov_cur.py b/tests/test_feature_pcov_cur.py
index d57dfcd39..d1f22b358 100644
--- a/tests/test_feature_pcov_cur.py
+++ b/tests/test_feature_pcov_cur.py
@@ -41,7 +41,7 @@ def test_non_it(self):
         self.idx = [2, 8, 3, 6, 7, 9, 1, 0, 5]
         selector = PCovCUR(n_to_select=9, recompute_every=0)
         selector.fit(self.X, self.y)
-        print(selector.selected_idx_)
+
         self.assertTrue(np.allclose(selector.selected_idx_, self.idx))
 
 

From 81177953e2fe320b0f24311caa8673cbbed7be11 Mon Sep 17 00:00:00 2001
From: Joseph Abbott <joseph.william.abbott@gmail.com>
Date: Tue, 26 Mar 2024 19:10:13 +0000
Subject: [PATCH 4/4] CUR feature selection test

---
 tests/test_feature_simple_cur.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/test_feature_simple_cur.py b/tests/test_feature_simple_cur.py
index 72554471d..4e49d643b 100644
--- a/tests/test_feature_simple_cur.py
+++ b/tests/test_feature_simple_cur.py
@@ -46,6 +46,22 @@ def test_non_it(self):
 
         self.assertTrue(np.allclose(selector.selected_idx_, ref_idx))
 
+    def test_unique_selected_idx_zero_score(self):
+        """
+        Tests that the selected idxs are unique, which may not be the
+        case when the score is numerically zero
+        """
+        np.random.seed(0)
+        n_samples = 10
+        n_features = 15
+        X = np.random.rand(n_samples, n_features)
+        X[:, 3] = np.random.rand(10) * 1e-13
+        X[:, 4] = np.random.rand(10) * 1e-13
+        selector_problem = CUR(n_to_select=len(X.T)).fit(X)
+        assert len(selector_problem.selected_idx_) == len(
+            set(selector_problem.selected_idx_)
+        )
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)