|
| 1 | +from collections import defaultdict |
1 | 2 | from typing import List, Union |
2 | 3 |
|
3 | 4 | import pandas as pd |
@@ -99,7 +100,6 @@ def __init__( |
99 | 100 | missing_values: str = "ignore", |
100 | 101 | confirm_variables: bool = False, |
101 | 102 | ): |
102 | | - |
103 | 103 | if missing_values not in ["raise", "ignore"]: |
104 | 104 | raise ValueError("missing_values takes only values 'raise' or 'ignore'.") |
105 | 105 |
|
@@ -136,42 +136,30 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None): |
136 | 136 | # check if dataset contains na |
137 | 137 | _check_contains_na(X, self.variables_) |
138 | 138 |
|
139 | | - # create tuples of duplicated feature groups |
140 | | - self.duplicated_feature_sets_ = [] |
141 | | - |
142 | | - # set to collect features that are duplicated |
143 | | - self.features_to_drop_ = set() # type: ignore |
144 | | - |
145 | | - # create set of examined features |
146 | | - _examined_features = set() |
147 | | - |
148 | | - for feature in self.variables_: |
| 139 | + # collect duplicate features |
| 140 | + _features_hashmap = defaultdict(list) |
149 | 141 |
|
150 | | - # append so we can remove when we create the combinations |
151 | | - _examined_features.add(feature) |
| 142 | + # hash the features |
| 143 | + _X_hash = pd.util.hash_pandas_object(X[self.variables_].T, index=False) |
152 | 144 |
|
153 | | - if feature not in self.features_to_drop_: |
| 145 | + # group the features by hash |
| 146 | + for feature, feature_hash in _X_hash.items(): |
| 147 | + _features_hashmap[feature_hash].append(feature) |
154 | 148 |
|
155 | | - _temp_set = set([feature]) |
156 | | - |
157 | | - # features that have not been examined, are not currently examined and |
158 | | - # were not found duplicates |
159 | | - _features_to_compare = [ |
160 | | - f |
161 | | - for f in self.variables_ |
162 | | - if f not in _examined_features.union(self.features_to_drop_) |
163 | | - ] |
164 | | - |
165 | | - # create combinations: |
166 | | - for f2 in _features_to_compare: |
167 | | - |
168 | | - if X[feature].equals(X[f2]): |
169 | | - self.features_to_drop_.add(f2) |
170 | | - _temp_set.add(f2) |
| 149 | + # group duplicated features into sets |
| 150 | + self.duplicated_feature_sets_ = [ |
| 151 | + set(duplicate) |
| 152 | + for duplicate in _features_hashmap.values() |
| 153 | + if len(duplicate) > 1 |
| 154 | + ] |
171 | 155 |
|
172 | | - # if there are duplicated features |
173 | | - if len(_temp_set) > 1: |
174 | | - self.duplicated_feature_sets_.append(_temp_set) |
| 156 | + # set to collect features that are duplicated |
| 157 | + self.features_to_drop_ = { |
| 158 | + item |
| 159 | + for duplicates in _features_hashmap.values() |
| 160 | + for item in duplicates[1:] |
| 161 | + if duplicates and len(duplicates) > 1 |
| 162 | + } |
175 | 163 |
|
176 | 164 | # save input features |
177 | 165 | self._get_feature_names_in(X) |
|
0 commit comments