Skip to content

Commit 4e42fa3

Browse files
committed
MNT change extend to minibatch to split columns of both X and y into batches
1 parent b53076a commit 4e42fa3

File tree

9 files changed

+200
-213
lines changed

9 files changed

+200
-213
lines changed

.readthedocs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ version: 2
22
build:
33
os: ubuntu-22.04
44
tools:
5-
python: "3.12"
5+
python: "3.13"
66

77
sphinx:
88
configuration: doc/conf.py

doc/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ API Reference
1919

2020
FastCan
2121
refine
22-
extend
22+
minibatch
2323
ssc
2424
ols
2525
make_poly_ids

fastcan/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
The :mod:`fastcan` module implements algorithms, including
33
"""
44

5-
from ._extend import extend
65
from ._fastcan import FastCan
6+
from ._minibatch import minibatch
77
from ._narx import (
88
Narx,
99
make_narx,
@@ -21,7 +21,7 @@
2121
"ssc",
2222
"ols",
2323
"refine",
24-
"extend",
24+
"minibatch",
2525
"make_narx",
2626
"print_narx",
2727
"Narx",

fastcan/_extend.py

Lines changed: 0 additions & 120 deletions
This file was deleted.

fastcan/_fastcan.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,6 @@ def _get_support_mask(self):
287287

288288

289289
def _prepare_search(n_features, n_features_to_select, indices_include, indices_exclude):
290-
""" """
291290
# initiated with -1
292291
indices = np.full(n_features_to_select, -1, dtype=np.intc, order="F")
293292
indices[: indices_include.size] = indices_include

fastcan/_minibatch.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
"""
2+
Feature selection with mini-batches.
3+
"""
4+
5+
from copy import deepcopy
6+
from numbers import Integral
7+
8+
import numpy as np
9+
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
10+
from sklearn.utils._param_validation import Interval, validate_params
11+
from sklearn.utils.validation import check_X_y
12+
13+
from ._cancorr_fast import _forward_search # type: ignore
14+
from ._fastcan import FastCan, _prepare_search
15+
16+
17+
@validate_params(
18+
{
19+
"X": ["array-like"],
20+
"y": ["array-like"],
21+
"n_features_to_select": [
22+
Interval(Integral, 1, None, closed="left"),
23+
],
24+
"batch_size": [
25+
Interval(Integral, 1, None, closed="left"),
26+
],
27+
},
28+
prefer_skip_nested_validation=False,
29+
)
30+
def minibatch(X, y, n_features_to_select=1, batch_size=1):
31+
"""FastCan selection with mini batches.
32+
33+
It is suitable for selecting a very large number of features
34+
even larger than the number of samples.
35+
36+
Similar to the correlation filter which selects each feature without considering
37+
the redundancy, the function selects features in mini-batch and the
38+
redundancy between the two mini-batches will be ignored.
39+
40+
Parameters
41+
----------
42+
X : array-like of shape (n_samples, n_features)
43+
Feature matrix.
44+
45+
y : array-like of shape (n_samples, n_outputs)
46+
Target matrix.
47+
48+
n_features_to_select : int, default=1
49+
The parameter is the absolute number of features to select.
50+
51+
batch_size : int, default=1
52+
The number of features in a mini-batch.
53+
54+
Returns
55+
-------
56+
indices : ndarray of shape (n_features_to_select,), dtype=int
57+
The indices of the selected features.
58+
59+
Examples
60+
--------
61+
>>> from fastcan import minibatch
62+
>>> X = [[1, 1, 0], [0.01, 0, 0], [-1, 0, 1], [0, 0, 0]]
63+
>>> y = [1, 0, -1, 0]
64+
>>> indices = minibatch(X, y, 3, batch_size=2)
65+
>>> print(f"Indices: {indices}")
66+
Indices: [0 1 2]
67+
"""
68+
X, y = check_X_y(X, y, ensure_2d=True, multi_output=True)
69+
if y.ndim == 1:
70+
y = y.reshape(-1, 1)
71+
72+
n_features = X.shape[1]
73+
n_outputs = y.shape[1]
74+
75+
if n_features_to_select > n_features:
76+
raise ValueError(
77+
f"n_features_to_select {n_features_to_select} "
78+
f"must be <= n_features {n_features}."
79+
)
80+
81+
n_threads = _openmp_effective_n_threads()
82+
83+
output_arange = np.r_[np.arange(n_outputs, step=batch_size, dtype=int), n_outputs]
84+
n_to_select_split = np.diff(
85+
np.linspace(
86+
0, n_features_to_select, num=output_arange.size, endpoint=True, dtype=int
87+
)
88+
)
89+
indices_select = np.zeros(0, dtype=int)
90+
for i in range(n_to_select_split.size):
91+
y_i = y[:, output_arange[i] : output_arange[i + 1]]
92+
batch_split_i = np.diff(
93+
np.r_[
94+
np.arange(n_to_select_split[i], step=batch_size, dtype=int),
95+
n_to_select_split[i],
96+
]
97+
)
98+
for j, batch_size_j in enumerate(batch_split_i):
99+
if j == 0:
100+
selector_j = FastCan(
101+
batch_size_j, indices_exclude=indices_select, verbose=0
102+
).fit(X, y_i)
103+
X_transformed_ = deepcopy(selector_j.X_transformed_)
104+
indices = selector_j.indices_
105+
else:
106+
indices, scores, mask = _prepare_search(
107+
n_features,
108+
batch_size_j,
109+
selector_j.indices_include_,
110+
np.r_[selector_j.indices_exclude_, indices_select],
111+
)
112+
_forward_search(
113+
X=X_transformed_,
114+
V=selector_j.y_transformed_,
115+
t=batch_size_j,
116+
tol=selector_j.tol,
117+
num_threads=n_threads,
118+
verbose=0,
119+
mask=mask,
120+
indices=indices,
121+
scores=scores,
122+
)
123+
indices_select = np.r_[indices_select, indices]
124+
return indices_select

fastcan/_narx.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -453,8 +453,8 @@ def fit(self, X, y, coef_init=None, **params):
453453
# fit a one-step-ahead Narx model
454454
xy_hstack = np.c_[X, y]
455455
osa_narx = LinearRegression()
456-
time_shift_terms = make_time_shift_features(xy_hstack, self.time_shift_ids_)
457-
poly_terms = make_poly_features(time_shift_terms, self.poly_ids_)
456+
time_shift_vars = make_time_shift_features(xy_hstack, self.time_shift_ids_)
457+
poly_terms = make_poly_features(time_shift_vars, self.poly_ids_)
458458

459459
osa_narx.fit(poly_terms, y)
460460
if coef_init is None:
@@ -644,6 +644,7 @@ def print_narx(
644644
| X[k-0,9] | 68 |
645645
"""
646646
check_is_fitted(narx)
647+
647648
def _get_variable_str(time_shift_id):
648649
if time_shift_id[0] < narx.n_features_in_:
649650
variable_str = f"X[k-{time_shift_id[1]},{time_shift_id[0]}]"
@@ -822,13 +823,13 @@ def make_narx(
822823
),
823824
0,
824825
)
825-
time_shift_terms = make_time_shift_features(xy_hstack, time_shift_ids_all)
826+
time_shift_vars = make_time_shift_features(xy_hstack, time_shift_ids_all)
826827

827828
poly_ids_all = make_poly_ids(
828829
time_shift_ids_all.shape[0],
829830
poly_degree,
830831
)
831-
poly_terms = make_poly_features(time_shift_terms, poly_ids_all)
832+
poly_terms = make_poly_features(time_shift_vars, poly_ids_all)
832833

833834
csf = FastCan(
834835
n_features_to_select,

0 commit comments

Comments
 (0)