Skip to content

Commit 0840101

Browse files
Merge pull request #19 from MatthewSZhang/exclude
FEAT add indices_exclude params
2 parents 6eda8c7 + a74154a commit 0840101

File tree

4 files changed

+807
-797
lines changed

4 files changed

+807
-797
lines changed

fastcan/_cancorr_fast.pyx

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@ from cython.parallel import prange
88
from scipy.linalg.cython_blas cimport isamax, idamax
99
from sklearn.utils._cython_blas cimport ColMajor, NoTrans
1010
from sklearn.utils._cython_blas cimport _dot, _scal, _nrm2, _gemm, _axpy
11-
from sklearn.utils._typedefs cimport int32_t
11+
from sklearn.utils._typedefs cimport int32_t, uint8_t
1212

1313

1414
@final
1515
cdef int _bsum(
16-
const bint* x,
16+
const uint8_t* x,
1717
int n,
1818
) noexcept nogil:
1919
"""Computes the sum of the vector of bool elements.
@@ -129,6 +129,7 @@ cpdef int _forward_search(
129129
floating tol, # IN
130130
int num_threads, # IN
131131
int verbose, # IN
132+
uint8_t[::1] mask, # IN/TEMP
132133
int32_t[::1] indices, # OUT
133134
floating[::1] scores, # OUT
134135
) except -1 nogil:
@@ -140,6 +141,7 @@ cpdef int _forward_search(
140141
is orthonormal to selected features and M.
141142
t : Non-negative integer. The number of features to be selected.
142143
tol : Tolerance for linear dependence check.
144+
mask: (n_features,) Mask for invalid candidate features (nonzero = excluded from the search).
143145
indices: (t,) The indices vector of selected features, initialized with -1.
144146
scores: (t,) The h-correlation/eta-cosine of selected features.
145147
"""
@@ -149,7 +151,6 @@ cpdef int _forward_search(
149151
# OpenMP (in Windows) requires signed integral for prange
150152
int n_features = X.shape[1]
151153
floating* r2 = <floating*> malloc(sizeof(floating) * n_features)
152-
bint* mask = <bint*> malloc(sizeof(bint) * n_features)
153154
floating g, ssc = 0.0
154155
int i, j
155156
int index = -1
@@ -160,7 +161,8 @@ cpdef int _forward_search(
160161
if i == 0:
161162
# Preprocessing
162163
for j in range(n_features):
163-
mask[j] = _normv(&X[0, j], n_samples)
164+
if not mask[j]:
165+
mask[j] = _normv(&X[0, j], n_samples)
164166
else:
165167
mask[index] = True
166168
r2[index] = 0
@@ -204,5 +206,4 @@ cpdef int _forward_search(
204206
with gil:
205207
print()
206208
free(r2)
207-
free(mask)
208209
return 0

fastcan/_fastcan.py

Lines changed: 60 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ class FastCan(SelectorMixin, BaseEstimator):
2929
indices_include : array-like of shape (n_inclusions,), default=None
3030
The indices of the prerequisite features.
3131
32+
indices_exclude : array-like of shape (n_exclusions,), default=None
33+
The indices of the excluded features.
34+
3235
eta : bool, default=False
3336
Whether to use eta-cosine method.
3437
@@ -63,6 +66,16 @@ class FastCan(SelectorMixin, BaseEstimator):
6366
The h-correlation/eta-cosine of selected features. The order of
6467
the scores is corresponding to the feature selection process.
6568
69+
X_transformed_ : ndarray of shape (n_samples_, n_features), dtype=float, order='F'
70+
Transformed feature matrix.
71+
When h-correlation method is used, n_samples_ = n_samples.
72+
When eta-cosine method is used, n_samples_ = n_features+n_outputs.
73+
74+
y_transformed_ : ndarray of shape (n_samples_, n_outputs), dtype=float, order='F'
75+
Transformed target matrix.
76+
When h-correlation method is used, n_samples_ = n_samples.
77+
When eta-cosine method is used, n_samples_ = n_features+n_outputs.
78+
6679
References
6780
----------
6881
* Zhang, S., & Lang, Z. Q. (2022).
@@ -88,6 +101,7 @@ class FastCan(SelectorMixin, BaseEstimator):
88101
Interval(Integral, 1, None, closed="left"),
89102
],
90103
"indices_include": [None, "array-like"],
104+
"indices_exclude": [None, "array-like"],
91105
"eta": ["boolean"],
92106
"tol": [Interval(Real, 0, None, closed="neither")],
93107
"verbose": ["verbose"],
@@ -97,12 +111,14 @@ def __init__(
97111
self,
98112
n_features_to_select=1,
99113
indices_include=None,
114+
indices_exclude=None,
100115
eta=False,
101116
tol=0.01,
102117
verbose=1,
103118
):
104119
self.n_features_to_select = n_features_to_select
105120
self.indices_include = indices_include
121+
self.indices_exclude = indices_exclude
106122
self.eta = eta
107123
self.tol = tol
108124
self.verbose = verbose
@@ -152,17 +168,6 @@ def fit(self, X, y):
152168
# [:, np.newaxis] that does not.
153169
y = y.reshape(-1, 1)
154170

155-
# indices_include
156-
if self.indices_include is None:
157-
indices_include = np.zeros(0, dtype=int)
158-
else:
159-
indices_include = check_array(
160-
self.indices_include,
161-
ensure_2d=False,
162-
dtype=int,
163-
ensure_min_samples=0,
164-
)
165-
166171
n_samples, n_features = X.shape
167172
n_outputs = y.shape[1]
168173

@@ -172,29 +177,12 @@ def fit(self, X, y):
172177
f"must be <= n_features {n_features}."
173178
)
174179

175-
if indices_include.ndim != 1:
176-
raise ValueError(
177-
f"Found indices_include with dim {indices_include.ndim}, "
178-
"but expected == 1."
179-
)
180-
181-
if indices_include.size >= n_features:
182-
raise ValueError(
183-
f"n_inclusions {indices_include.size} must "
184-
f"be < n_features {n_features}."
185-
)
186-
187-
if np.any((indices_include < 0) | (indices_include >= n_features)):
188-
raise ValueError(
189-
"Out of bounds. "
190-
f"All items in indices_include should be in [0, {n_features}). "
191-
f"But got indices_include = {indices_include}."
192-
)
193-
194180
if (n_samples < n_features + n_outputs) and self.eta:
195181
raise ValueError(
196182
"`eta` cannot be True, when n_samples < n_features+n_outputs."
197183
)
184+
indices_include = self._check_indices_params(self.indices_include, n_features)
185+
indices_exclude = self._check_indices_params(self.indices_exclude, n_features)
198186

199187
if self.eta:
200188
xy_hstack = np.hstack((X, y))
@@ -204,23 +192,28 @@ def fit(self, X, y):
204192
)[1:]
205193
qxy_transformed = singular_values.reshape(-1, 1) * unitary_arrays
206194
qxy_transformed = np.asfortranarray(qxy_transformed)
207-
X_transformed = qxy_transformed[:, :n_features]
208-
y_transformed = orth(qxy_transformed[:, n_features:])
195+
self.X_transformed_ = qxy_transformed[:, :n_features]
196+
self.y_transformed_ = orth(qxy_transformed[:, n_features:])
209197
else:
210-
X_transformed = X - X.mean(0)
211-
y_transformed = orth(y - y.mean(0))
198+
self.X_transformed_ = X - X.mean(0)
199+
self.y_transformed_ = orth(y - y.mean(0))
200+
201+
# initialized with -1
202+
indices = np.full(self.n_features_to_select, -1, dtype=np.intc, order="F")
203+
indices[: indices_include.size] = indices_include
204+
scores = np.zeros(self.n_features_to_select, dtype=float, order="F")
205+
mask = np.zeros(n_features, dtype=np.ubyte, order="F")
206+
mask[indices_exclude] = True
212207

213-
indices, scores = self._prepare_data(
214-
indices_include,
215-
)
216208
n_threads = _openmp_effective_n_threads()
217209
_forward_search(
218-
X=X_transformed,
219-
V=y_transformed,
210+
X=self.X_transformed_,
211+
V=self.y_transformed_,
220212
t=self.n_features_to_select,
221213
tol=self.tol,
222214
num_threads=n_threads,
223215
verbose=self.verbose,
216+
mask=mask,
224217
indices=indices,
225218
scores=scores,
226219
)
@@ -231,34 +224,37 @@ def fit(self, X, y):
231224
self.scores_ = scores
232225
return self
233226

234-
def _prepare_data(self, indices_include):
235-
"""Prepare data for _forward_search()
236-
When h-correlation method is used, n_samples_ = n_samples.
237-
When eta-cosine method is used, n_samples_ = n_features+n_outputs.
238-
239-
Parameters
240-
----------
241-
indices_include : array-like of shape (n_inclusions,), dtype=int
242-
The indices of the prerequisite features.
227+
def _check_indices_params(self, indices_params, n_features):
228+
"""Check indices_include or indices_exclude."""
229+
if indices_params is None:
230+
indices_params = np.zeros(0, dtype=int)
231+
else:
232+
indices_params = check_array(
233+
indices_params,
234+
ensure_2d=False,
235+
dtype=int,
236+
ensure_min_samples=0,
237+
)
243238

244-
Returns
245-
-------
246-
mask : ndarray of shape (n_features,), dtype=np.ubyte, order='F'
247-
Mask for invalid candidate features.
248-
The data type is unsigned char.
239+
if indices_params.ndim != 1:
240+
raise ValueError(
241+
f"Found indices_params with dim {indices_params.ndim}, "
242+
"but expected == 1."
243+
)
249244

250-
indices: ndarray of shape (n_features_to_select,), dtype=np.intc, order='F'
251-
The indices vector of selected features, initiated with -1.
252-
The data type is signed int.
245+
if indices_params.size >= n_features:
246+
raise ValueError(
247+
f"The number of indices in indices_params {indices_params.size} must "
248+
f"be < n_features {n_features}."
249+
)
253250

254-
scores: ndarray of shape (n_features_to_select,), dtype=float, order='F'
255-
The h-correlation/eta-cosine of selected features.
256-
"""
257-
# initiated with -1
258-
indices = np.full(self.n_features_to_select, -1, dtype=np.intc, order="F")
259-
indices[: indices_include.size] = indices_include
260-
scores = np.zeros(self.n_features_to_select, dtype=float, order="F")
261-
return indices, scores
251+
if np.any((indices_params < 0) | (indices_params >= n_features)):
252+
raise ValueError(
253+
"Out of bounds. "
254+
f"All items in indices_params should be in [0, {n_features}). "
255+
f"But got indices_params = {indices_params}."
256+
)
257+
return indices_params
262258

263259
def _get_support_mask(self):
264260
check_is_fitted(self)

0 commit comments

Comments
 (0)