Skip to content

Commit f72d11d

Browse files
Merge pull request #21 from MatthewSZhang/refine
FEAT add two-stage refine
2 parents 1d007de + bbb7678 commit f72d11d

File tree

7 files changed

+311
-16
lines changed

7 files changed

+311
-16
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ jobs:
4747
run: |
4848
FMT=xml pixi run test-coverage
4949
- name: Upload coverage reports to Codecov
50-
uses: codecov/[email protected].0
50+
uses: codecov/[email protected].2
5151
with:
5252
token: ${{ secrets.CODECOV_TOKEN }}
5353
- name: Build SDist

doc/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ API Reference
1818
:toctree: generated/
1919

2020
FastCan
21+
refine
2122
ssc
2223
ols
2324

fastcan/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33
"""
44

55
from ._fastcan import FastCan
6+
from ._refine import refine
67
from ._utils import ols, ssc
78

89
__all__ = [
910
"FastCan",
1011
"ssc",
1112
"ols",
13+
"refine",
1214
]

fastcan/_fastcan.py

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Feature selection
33
"""
44

5+
from copy import deepcopy
56
from numbers import Integral, Real
67

78
import numpy as np
@@ -66,15 +67,15 @@ class FastCan(SelectorMixin, BaseEstimator):
6667
The h-correlation/eta-cosine of selected features. The order of
6768
the scores is corresponding to the feature selection process.
6869
69-
X_transformed_ : ndarray of shape (n_samples_, n_features), dtype=float, order='F'
70+
X_transformed_ : ndarray of shape (`n_samples_`, n_features), dtype=float, order='F'
7071
Transformed feature matrix.
71-
When h-correlation method is used, n_samples_ = n_samples.
72-
When eta-cosine method is used, n_samples_ = n_features+n_outputs.
72+
When h-correlation method is used, `n_samples_` = n_samples.
73+
When eta-cosine method is used, `n_samples_` = n_features+n_outputs.
7374
74-
y_transformed_ : ndarray of shape (n_samples_, n_outputs), dtype=float, order='F'
75+
y_transformed_ : ndarray of shape (`n_samples_`, n_outputs), dtype=float, order='F'
7576
Transformed target matrix.
76-
When h-correlation method is used, n_samples_ = n_samples.
77-
When eta-cosine method is used, n_samples_ = n_features+n_outputs.
77+
When h-correlation method is used, `n_samples_` = n_samples.
78+
When eta-cosine method is used, `n_samples_` = n_features+n_outputs.
7879
7980
References
8081
----------
@@ -181,8 +182,26 @@ def fit(self, X, y):
181182
raise ValueError(
182183
"`eta` cannot be True, when n_samples < n_features+n_outputs."
183184
)
184-
indices_include = self._check_indices_params(self.indices_include, n_features)
185-
indices_exclude = self._check_indices_params(self.indices_exclude, n_features)
185+
self.indices_include_ = self._check_indices_params(
186+
self.indices_include, n_features
187+
)
188+
self.indices_exclude_ = self._check_indices_params(
189+
self.indices_exclude, n_features
190+
)
191+
if np.intersect1d(self.indices_include_, self.indices_exclude_).size != 0:
192+
raise ValueError(
193+
"`indices_include` and `indices_exclude` should not have intersection."
194+
)
195+
196+
n_candidates = (
197+
n_features - self.indices_exclude_.size - self.n_features_to_select
198+
)
199+
if n_candidates < 0:
200+
raise ValueError(
201+
"n_features - n_features_to_select - n_exclusions should >= 0."
202+
)
203+
if self.n_features_to_select - self.indices_include_.size < 0:
204+
raise ValueError("n_features_to_select - n_inclusions should >= 0.")
186205

187206
if self.eta:
188207
xy_hstack = np.hstack((X, y))
@@ -198,16 +217,16 @@ def fit(self, X, y):
198217
self.X_transformed_ = X - X.mean(0)
199218
self.y_transformed_ = orth(y - y.mean(0))
200219

201-
# initiated with -1
202-
indices = np.full(self.n_features_to_select, -1, dtype=np.intc, order="F")
203-
indices[: indices_include.size] = indices_include
204-
scores = np.zeros(self.n_features_to_select, dtype=float, order="F")
205-
mask = np.zeros(n_features, dtype=np.ubyte, order="F")
206-
mask[indices_exclude] = True
220+
indices, scores, mask = _prepare_search(
221+
n_features,
222+
self.n_features_to_select,
223+
self.indices_include_,
224+
self.indices_exclude_,
225+
)
207226

208227
n_threads = _openmp_effective_n_threads()
209228
_forward_search(
210-
X=self.X_transformed_,
229+
X=deepcopy(self.X_transformed_),
211230
V=self.y_transformed_,
212231
t=self.n_features_to_select,
213232
tol=self.tol,
@@ -259,3 +278,15 @@ def _check_indices_params(self, indices_params, n_features):
259278
def _get_support_mask(self):
260279
check_is_fitted(self)
261280
return self.support_
281+
282+
283+
def _prepare_search(n_features, n_features_to_select, indices_include, indices_exclude):
    """Allocate the index, score and mask buffers handed to the search.

    Parameters
    ----------
    n_features : int
        Length of the returned mask.

    n_features_to_select : int
        Length of the returned index and score buffers.

    indices_include : ndarray
        Indices written to the leading slots of the index buffer.

    indices_exclude : ndarray
        Positions of the mask that are set to 1.

    Returns
    -------
    indices : ndarray of shape (n_features_to_select,), dtype=np.intc
        `indices_include` followed by -1 in every remaining slot.

    scores : ndarray of shape (n_features_to_select,), dtype=float
        All zeros.

    mask : ndarray of shape (n_features,), dtype=np.ubyte
        1 at the positions listed in `indices_exclude`, 0 elsewhere.
    """
    n_preset = indices_include.size

    # -1 marks a slot that has not been filled yet.
    indices = np.empty(n_features_to_select, dtype=np.intc, order="F")
    indices.fill(-1)
    indices[:n_preset] = indices_include

    scores = np.zeros(n_features_to_select, dtype=float, order="F")

    mask = np.zeros(n_features, dtype=np.ubyte, order="F")
    mask[indices_exclude] = True

    return indices, scores, mask

fastcan/_refine.py

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
"""
2+
Refine fastcan selection results
3+
"""
4+
5+
from copy import deepcopy
6+
from numbers import Integral
7+
8+
import numpy as np
9+
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
10+
from sklearn.utils._param_validation import Interval, StrOptions, validate_params
11+
from sklearn.utils.validation import check_is_fitted
12+
13+
from ._cancorr_fast import _forward_search # type: ignore
14+
from ._fastcan import FastCan, _prepare_search
15+
16+
17+
@validate_params(
    {
        "selector": [FastCan],
        "drop": [
            Interval(Integral, 1, None, closed="left"),
            StrOptions({"all"}),
            "array-like",
        ],
        "max_iter": [
            None,
            Interval(Integral, 1, None, closed="left"),
        ],
        "verbose": ["verbose"],
    },
    prefer_skip_nested_validation=True,
)
def refine(selector, drop=1, max_iter=None, verbose=1):
    """Two-Stage Refining.

    In the refining process, some of the selected features are dropped, and
    the vacant positions are refilled from the candidate features.

    An `iteration` is one pass in which the vacant positions are refilled
    after searching all candidate features.

    A `valid iteration` is an iteration whose refilled selection differs from
    the best selection found so far and increases the SSC (sum of the scores)
    of the selected features.

    Parameters
    ----------
    selector : FastCan
        Fitted FastCan selector.

    drop : int or array-like of shape (n_drops,) or "all", default=1
        The number of the selected features dropped for the subsequent
        reselection. Each value must satisfy
        ``1 <= drop < n_features_to_select - n_inclusions``.
        If "all", every admissible value is tried in increasing order.

    max_iter : int, default=None
        The maximum number of iterations (valid or not) in the refining
        process. If None, the number of iterations is not limited.

    verbose : int, default=1
        The verbosity level. Progress is printed only when it equals 1.

    Returns
    -------
    indices : ndarray of shape (n_features_to_select,), dtype=int
        The indices of the selected features.

    scores : ndarray of shape (n_features_to_select,), dtype=float
        The h-correlation/eta-cosine of selected features.

    Raises
    ------
    ValueError
        If `drop` is empty (e.g. "all" when fewer than two features are
        freely selected) or contains a value outside the admissible range.

    References
    ----------
    * Zhang L., Li K., Bai E. W. and Irwin G. W. (2015).
        Two-stage orthogonal least squares methods for neural network construction.
        IEEE Transactions on Neural Networks and Learning Systems, 26(8), 1608-1621.

    Examples
    --------
    >>> from fastcan import FastCan, refine
    >>> X = [[1, 1, 0], [0.01, 0, 0], [-1, 0, 1], [0, 0, 0]]
    >>> y = [1, 0, -1, 0]
    >>> selector = FastCan(2, verbose=0).fit(X, y)
    >>> print(f"Indices: {selector.indices_}", f", SSC: {selector.scores_.sum():.5f}")
    Indices: [0 1] , SSC: 0.99998
    >>> indices, scores = refine(selector, drop=1, verbose=0)
    >>> print(f"Indices: {indices}", f", SSC: {scores.sum():.5f}")
    Indices: [1 2] , SSC: 1.00000
    """
    check_is_fitted(selector)
    # NOTE(review): `FastCan.fit` hands a fresh copy of X to `_forward_search`,
    # which suggests the search may modify X in place. Here a single copy is
    # shared by every search call below - confirm this is intended.
    X_transformed_ = deepcopy(selector.X_transformed_)
    n_features = selector.n_features_in_
    n_features_to_select = selector.n_features_to_select
    indices_include = selector.indices_include_
    indices_exclude = selector.indices_exclude_

    n_inclusions = indices_include.size
    n_selections = n_features_to_select - n_inclusions

    # Test the type first: comparing an ndarray with a string is elementwise
    # and its truth value is ambiguous.
    if isinstance(drop, str) and drop == "all":
        drop = np.arange(1, n_selections)
    else:
        drop = np.atleast_1d(drop).astype(int)

    # An empty `drop` would otherwise fail inside `drop.max()` with an
    # unrelated "zero-size array" error.
    if drop.size == 0:
        raise ValueError(
            "`drop` is empty: at least two features must be selected without "
            f"being forced by `indices_include`, but got n_selections={n_selections}."
        )

    if (drop.max() >= n_selections) or (drop.min() < 1):
        raise ValueError(
            "`drop` should be between `1<=drop<n_features_to_select-n_inclusions`, "
            f"but got drop={drop} and n_selections={n_selections}."
        )

    if max_iter is None:
        max_iter = np.inf

    # Loop-invariant, so resolved once instead of once per iteration.
    n_threads = _openmp_effective_n_threads()

    n_iters = 0
    n_valid_iters = 0
    best_scores = selector.scores_
    best_indices = selector.indices_
    best_ssc = selector.scores_.sum()
    indices_temp = best_indices
    for drop_n in drop:
        # `i` counts consecutive iterations without improvement for this
        # `drop_n`; it is reset whenever a valid iteration occurs.
        i = 0
        while i < n_features:
            # Keep the forced inclusions in front, rotate the remaining
            # selected indices left by one, then discard the last `drop_n`.
            rolled_indices = np.r_[
                indices_include, np.roll(indices_temp[n_inclusions:], -1)
            ]
            indices, scores, mask = _prepare_search(
                n_features,
                n_features_to_select,
                rolled_indices[:-drop_n],
                indices_exclude,
            )
            # `indices` and `scores` are read right after this call without
            # being reassigned, so the search is expected to fill them in place.
            _forward_search(
                X=X_transformed_,
                V=selector.y_transformed_,
                t=selector.n_features_to_select,
                tol=selector.tol,
                num_threads=n_threads,
                verbose=0,
                mask=mask,
                indices=indices,
                scores=scores,
            )

            ssc = scores.sum()
            if (ssc > best_ssc) and (set(indices) != set(best_indices)):
                i = 0
                n_valid_iters += 1
                best_scores = scores
                best_indices = indices
                best_ssc = ssc
            else:
                i += 1

            # The next iteration starts from the latest selection, whether or
            # not it improved on the best one.
            indices_temp = indices
            n_iters += 1
            if verbose == 1:
                print(
                    f"No. of iterations: {n_iters}, "
                    f"No. of valid iterations {n_valid_iters}, "
                    f"SSC: {best_scores.sum():.5f}",
                    end="\r",
                )

            if n_iters >= max_iter:
                if verbose == 1:
                    print()
                return best_indices, best_scores

    if verbose == 1:
        print()
    return best_indices, best_scores

tests/test_fastcan.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,20 @@ def test_raise_errors():
199199
indices_include=[[0]]
200200
)
201201

202+
selector_include_exclude_intersect = FastCan(
203+
n_features_to_select=n_features,
204+
indices_include=[0, 1],
205+
indices_exclude=[1, 2],
206+
)
207+
selector_n_candidates = FastCan(
208+
n_features_to_select=n_features,
209+
indices_exclude=[1, 2],
210+
)
211+
selector_too_many_inclusions = FastCan(
212+
n_features_to_select=2,
213+
indices_include=[1, 2, 3],
214+
)
215+
202216
with pytest.raises(ValueError, match=r"n_features_to_select .*"):
203217
selector_n_select.fit(X, y)
204218

@@ -214,6 +228,15 @@ def test_raise_errors():
214228
with pytest.raises(ValueError, match=r"`eta` cannot be True, .*"):
215229
selector_eta_for_small_size_samples.fit(X, y)
216230

231+
with pytest.raises(ValueError, match=r"`indices_include` and `indices_exclu.*"):
232+
selector_include_exclude_intersect.fit(X, y)
233+
234+
with pytest.raises(ValueError, match=r"n_features - n_features_to_select - n_e.*"):
235+
selector_n_candidates.fit(X, y)
236+
237+
with pytest.raises(ValueError, match=r"n_features_to_select - n_inclusions sho.*"):
238+
selector_too_many_inclusions.fit(X, y)
239+
217240

218241
@pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning")
219242
def test_cython_errors():

0 commit comments

Comments
 (0)