 from sklearn.utils.validation import check_X_y

 from ._cancorr_fast import _forward_search  # type: ignore
-from ._fastcan import FastCan, _prepare_search
+from ._fastcan import _prepare_search


 @validate_params(
@@ -26,17 +26,21 @@
         ],
         "verbose": ["verbose"],
     },
-    prefer_skip_nested_validation=False,
+    prefer_skip_nested_validation=True,
 )
 def minibatch(X, y, n_features_to_select=1, batch_size=1, verbose=1):
     """Feature selection using :class:`fastcan.FastCan` with mini batches.

     It is suitable for selecting a very large number of features
     even larger than the number of samples.

-    Similar to the correlation filter which selects each feature without considering
-    the redundancy, the function selects features in mini-batch and the
-    redundancy between the two mini-batches will be ignored.
+    The function splits `n_features_to_select` into `n_outputs` parts and selects
+    features for each part separately, ignoring the redundancy among outputs.
+    In each part, the function selects features batch-by-batch. The batch size is less
+    than or equal to `batch_size`.
+    Like correlation filters, which select features one-by-one without considering
+    the redundancy between two features, the function ignores the redundancy between
+    two mini-batches.

     Parameters
     ----------
@@ -70,7 +74,7 @@ def minibatch(X, y, n_features_to_select=1, batch_size=1, verbose=1):
     >>> print(f"Indices: {indices}")
     Indices: [0 1 2]
     """
-    X, y = check_X_y(X, y, ensure_2d=True, multi_output=True)
+    X, y = check_X_y(X, y, ensure_2d=True, multi_output=True, order="F")
     if y.ndim == 1:
         y = y.reshape(-1, 1)

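The hunk above adds `order="F"` to the `check_X_y` call, so the validated arrays come back Fortran-ordered (column-major). A minimal sketch of what that flag does, using scikit-learn's public `check_X_y`; the sample data is illustrative only:

import numpy as np
from sklearn.utils.validation import check_X_y

# Illustrative data; any 2-D X and 1-D or 2-D y would do.
X = np.arange(12, dtype=float).reshape(4, 3)  # C-contiguous by default
y = np.arange(4, dtype=float)

X_f, y_f = check_X_y(X, y, ensure_2d=True, multi_output=True, order="F")
print(X_f.flags["F_CONTIGUOUS"])  # True: columns are now contiguous in memory

A column-major layout presumably suits the column-wise access pattern of the Cython `_forward_search` routine; that motivation is an assumption, not something stated in the diff.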
@@ -90,41 +94,35 @@ def minibatch(X, y, n_features_to_select=1, batch_size=1, verbose=1):
             0, n_features_to_select, num=n_outputs + 1, endpoint=True, dtype=int
         )
     )
+    X_transformed_ = X - X.mean(0)
+    y_transformed_ = y - y.mean(0)
+    indices_include = np.zeros(0, dtype=int)  # just an empty array
     indices_select = np.zeros(0, dtype=int)
+
     for i in range(n_outputs):
-        y_i = y[:, i]
-        batch_split_i = np.diff(
-            np.r_[
-                np.arange(n_to_select_split[i], step=batch_size, dtype=int),
-                n_to_select_split[i],
-            ]
-        )
-        for j, batch_size_j in enumerate(batch_split_i):
-            if j == 0:
-                selector_j = FastCan(
-                    batch_size_j, indices_exclude=indices_select, verbose=0
-                ).fit(X, y_i)
-                X_transformed_ = deepcopy(selector_j.X_transformed_)
-                indices = selector_j.indices_
-            else:
-                indices, scores, mask = _prepare_search(
-                    n_features,
-                    batch_size_j,
-                    selector_j.indices_include_,
-                    np.r_[selector_j.indices_exclude_, indices_select],
-                )
-                _forward_search(
-                    X=X_transformed_,
-                    V=selector_j.y_transformed_,
-                    t=batch_size_j,
-                    tol=selector_j.tol,
-                    num_threads=n_threads,
-                    verbose=0,
-                    mask=mask,
-                    indices=indices,
-                    scores=scores,
-                )
+        y_i = y_transformed_[:, [i]]
+        n_selected_i = 0
+        while n_to_select_split[i] > n_selected_i:
+            batch_size_temp = min(batch_size, n_to_select_split[i] - n_selected_i)
+            indices, scores, mask = _prepare_search(
+                n_features,
+                batch_size_temp,
+                indices_include,
+                indices_select,
+            )
+            _forward_search(
+                X=deepcopy(X_transformed_),
+                V=y_i,
+                t=batch_size_temp,
+                tol=0.01,
+                num_threads=n_threads,
+                verbose=0,
+                mask=mask,
+                indices=indices,
+                scores=scores,
+            )
             indices_select = np.r_[indices_select, indices]
+            n_selected_i += batch_size_temp
             if verbose == 1:
                 print(
                     f"Progress: {indices_select.size}/{n_features_to_select}", end="\r"