Skip to content

Commit 293f6be

Browse files
MNT fix minibatch X_transformed_ changes per loop (#39)
1 parent 0b38856 commit 293f6be

File tree

5 files changed

+566
-108
lines changed

5 files changed

+566
-108
lines changed

fastcan/_fastcan.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ class FastCan(SelectorMixin, BaseEstimator):
117117
def __init__(
118118
self,
119119
n_features_to_select=1,
120+
*,
120121
indices_include=None,
121122
indices_exclude=None,
122123
eta=False,

fastcan/_minibatch.py

Lines changed: 36 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from sklearn.utils.validation import check_X_y
1212

1313
from ._cancorr_fast import _forward_search # type: ignore
14-
from ._fastcan import FastCan, _prepare_search
14+
from ._fastcan import _prepare_search
1515

1616

1717
@validate_params(
@@ -26,17 +26,21 @@
2626
],
2727
"verbose": ["verbose"],
2828
},
29-
prefer_skip_nested_validation=False,
29+
prefer_skip_nested_validation=True,
3030
)
3131
def minibatch(X, y, n_features_to_select=1, batch_size=1, verbose=1):
3232
"""Feature selection using :class:`fastcan.FastCan` with mini batches.
3333
3434
It is suitable for selecting a very large number of features
3535
even larger than the number of samples.
3636
37-
Similar to the correlation filter which selects each feature without considering
38-
the redundancy, the function selects features in mini-batch and the
39-
redundancy between the two mini-batches will be ignored.
37+
The function splits `n_features_to_select` into `n_outputs` parts and selects
38+
features for each part separately, ignoring the redundancy among outputs.
39+
In each part, the function selects features batch-by-batch. The batch size is less
40+
than or equal to `batch_size`.
41+
Like correlation filters, which select features one-by-one without considering
42+
the redundancy between two features, the function ignores the redundancy between
43+
two mini-batches.
4044
4145
Parameters
4246
----------
@@ -70,7 +74,7 @@ def minibatch(X, y, n_features_to_select=1, batch_size=1, verbose=1):
7074
>>> print(f"Indices: {indices}")
7175
Indices: [0 1 2]
7276
"""
73-
X, y = check_X_y(X, y, ensure_2d=True, multi_output=True)
77+
X, y = check_X_y(X, y, ensure_2d=True, multi_output=True, order="F")
7478
if y.ndim == 1:
7579
y = y.reshape(-1, 1)
7680

@@ -90,41 +94,35 @@ def minibatch(X, y, n_features_to_select=1, batch_size=1, verbose=1):
9094
0, n_features_to_select, num=n_outputs + 1, endpoint=True, dtype=int
9195
)
9296
)
97+
X_transformed_ = X - X.mean(0)
98+
y_transformed_ = y - y.mean(0)
99+
indices_include = np.zeros(0, dtype=int) # just an empty array
93100
indices_select = np.zeros(0, dtype=int)
101+
94102
for i in range(n_outputs):
95-
y_i = y[:, i]
96-
batch_split_i = np.diff(
97-
np.r_[
98-
np.arange(n_to_select_split[i], step=batch_size, dtype=int),
99-
n_to_select_split[i],
100-
]
101-
)
102-
for j, batch_size_j in enumerate(batch_split_i):
103-
if j == 0:
104-
selector_j = FastCan(
105-
batch_size_j, indices_exclude=indices_select, verbose=0
106-
).fit(X, y_i)
107-
X_transformed_ = deepcopy(selector_j.X_transformed_)
108-
indices = selector_j.indices_
109-
else:
110-
indices, scores, mask = _prepare_search(
111-
n_features,
112-
batch_size_j,
113-
selector_j.indices_include_,
114-
np.r_[selector_j.indices_exclude_, indices_select],
115-
)
116-
_forward_search(
117-
X=X_transformed_,
118-
V=selector_j.y_transformed_,
119-
t=batch_size_j,
120-
tol=selector_j.tol,
121-
num_threads=n_threads,
122-
verbose=0,
123-
mask=mask,
124-
indices=indices,
125-
scores=scores,
126-
)
103+
y_i = y_transformed_[:, [i]]
104+
n_selected_i = 0
105+
while n_to_select_split[i] > n_selected_i:
106+
batch_size_temp = min(batch_size, n_to_select_split[i] - n_selected_i)
107+
indices, scores, mask = _prepare_search(
108+
n_features,
109+
batch_size_temp,
110+
indices_include,
111+
indices_select,
112+
)
113+
_forward_search(
114+
X=deepcopy(X_transformed_),
115+
V=y_i,
116+
t=batch_size_temp,
117+
tol=0.01,
118+
num_threads=n_threads,
119+
verbose=0,
120+
mask=mask,
121+
indices=indices,
122+
scores=scores,
123+
)
127124
indices_select = np.r_[indices_select, indices]
125+
n_selected_i += batch_size_temp
128126
if verbose == 1:
129127
print(
130128
f"Progress: {indices_select.size}/{n_features_to_select}", end="\r"

meson.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
project(
22
'fastcan',
33
'c', 'cython',
4-
version: '0.3.1',
4+
version: '0.3.2',
55
license: 'MIT',
66
meson_version: '>= 1.1.0',
77
default_options: [

0 commit comments

Comments
 (0)