11
11
from sklearn .utils .validation import check_X_y
12
12
13
13
from ._cancorr_fast import _forward_search # type: ignore
14
- from ._fastcan import FastCan , _prepare_search
14
+ from ._fastcan import _prepare_search
15
15
16
16
17
17
@validate_params (
26
26
],
27
27
"verbose" : ["verbose" ],
28
28
},
29
- prefer_skip_nested_validation = False ,
29
+ prefer_skip_nested_validation = True ,
30
30
)
31
31
def minibatch (X , y , n_features_to_select = 1 , batch_size = 1 , verbose = 1 ):
32
32
"""Feature selection using :class:`fastcan.FastCan` with mini batches.
33
33
34
34
It is suitable for selecting a very large number of features,
35
35
even larger than the number of samples.
36
36
37
- Similar to the correlation filter which selects each feature without considering
38
- the redundancy, the function selects features in mini-batch and the
39
- redundancy between the two mini-batches will be ignored.
37
+ The function splits `n_features_to_select` into `n_outputs` parts and selects
38
+ features for each part separately, ignoring the redundancy among outputs.
39
+ In each part, the function selects features batch-by-batch. Each batch contains
40
+ at most `batch_size` features.
41
+ Like correlation filters, which select features one-by-one without considering
42
+ the redundancy between two features, the function ignores the redundancy between
43
+ two mini-batches.
40
44
41
45
Parameters
42
46
----------
@@ -70,7 +74,7 @@ def minibatch(X, y, n_features_to_select=1, batch_size=1, verbose=1):
70
74
>>> print(f"Indices: {indices}")
71
75
Indices: [0 1 2]
72
76
"""
73
- X , y = check_X_y (X , y , ensure_2d = True , multi_output = True )
77
+ X , y = check_X_y (X , y , ensure_2d = True , multi_output = True , order = "F" )
74
78
if y .ndim == 1 :
75
79
y = y .reshape (- 1 , 1 )
76
80
@@ -90,41 +94,35 @@ def minibatch(X, y, n_features_to_select=1, batch_size=1, verbose=1):
90
94
0 , n_features_to_select , num = n_outputs + 1 , endpoint = True , dtype = int
91
95
)
92
96
)
97
+ X_transformed_ = X - X .mean (0 )
98
+ y_transformed_ = y - y .mean (0 )
99
+ indices_include = np .zeros (0 , dtype = int ) # just an empty array
93
100
indices_select = np .zeros (0 , dtype = int )
101
+
94
102
for i in range (n_outputs ):
95
- y_i = y [:, i ]
96
- batch_split_i = np .diff (
97
- np .r_ [
98
- np .arange (n_to_select_split [i ], step = batch_size , dtype = int ),
99
- n_to_select_split [i ],
100
- ]
101
- )
102
- for j , batch_size_j in enumerate (batch_split_i ):
103
- if j == 0 :
104
- selector_j = FastCan (
105
- batch_size_j , indices_exclude = indices_select , verbose = 0
106
- ).fit (X , y_i )
107
- X_transformed_ = deepcopy (selector_j .X_transformed_ )
108
- indices = selector_j .indices_
109
- else :
110
- indices , scores , mask = _prepare_search (
111
- n_features ,
112
- batch_size_j ,
113
- selector_j .indices_include_ ,
114
- np .r_ [selector_j .indices_exclude_ , indices_select ],
115
- )
116
- _forward_search (
117
- X = X_transformed_ ,
118
- V = selector_j .y_transformed_ ,
119
- t = batch_size_j ,
120
- tol = selector_j .tol ,
121
- num_threads = n_threads ,
122
- verbose = 0 ,
123
- mask = mask ,
124
- indices = indices ,
125
- scores = scores ,
126
- )
103
+ y_i = y_transformed_ [:, [i ]]
104
+ n_selected_i = 0
105
+ while n_to_select_split [i ] > n_selected_i :
106
+ batch_size_temp = min (batch_size , n_to_select_split [i ] - n_selected_i )
107
+ indices , scores , mask = _prepare_search (
108
+ n_features ,
109
+ batch_size_temp ,
110
+ indices_include ,
111
+ indices_select ,
112
+ )
113
+ _forward_search (
114
+ X = deepcopy (X_transformed_ ),
115
+ V = y_i ,
116
+ t = batch_size_temp ,
117
+ tol = 0.01 ,
118
+ num_threads = n_threads ,
119
+ verbose = 0 ,
120
+ mask = mask ,
121
+ indices = indices ,
122
+ scores = scores ,
123
+ )
127
124
indices_select = np .r_ [indices_select , indices ]
125
+ n_selected_i += batch_size_temp
128
126
if verbose == 1 :
129
127
print (
130
128
f"Progress: { indices_select .size } /{ n_features_to_select } " , end = "\r "
0 commit comments