import numpy as np
import pandas as pd
from category_encoders import OrdinalEncoder
import category_encoders.utils as util


class RankHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
    """The rank-hot encoder is similar to a one-hot encoder,
    except that every feature up to and including the current rank is hot.
    This is also called thermometer encoding: for example, a value with
    ordinal rank 2 out of 4 categories is encoded as [1, 1, 0, 0].

    Parameters
    ----------

    verbose: int
        integer indicating verbosity of the output. 0 for none.
    cols: list
        a list of columns to encode. If None, all string columns will be encoded.
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance.
    use_cat_names: bool
        if True, category values will be included in the encoded column names.
        Since this can result in duplicate column names,
        duplicates are suffixed with a '#' symbol until a unique name is generated.
        If False, category indices will be used instead of the category values.
    handle_unknown: str
        options are 'error', 'value' and 'return_nan'.
        The default is 'value'.
        'value': if an unknown label occurs, it is encoded as an all-zero array.
        'error': if an unknown label occurs, an error is raised.
        'return_nan': if an unknown label occurs, np.nan is returned in all columns.
    handle_missing: str
        options are 'error', 'value' and 'return_nan'. The default is 'value'.
        Missing values are also treated as unknown values in the final data set.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import fetch_openml
    >>> bunch = fetch_openml(name="house_prices", as_frame=True)
    >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"]
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
    >>> enc = RankHotEncoder(cols=['CentralAir', 'Heating']).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1460 entries, 0 to 1459
    Data columns (total 13 columns):
     #   Column        Non-Null Count  Dtype
    ---  ------        --------------  -----
     0   Id            1460 non-null   float64
     1   MSSubClass    1460 non-null   float64
     2   MSZoning      1460 non-null   object
     3   LotFrontage   1201 non-null   float64
     4   YearBuilt     1460 non-null   float64
     5   Heating_1     1460 non-null   int64
     6   Heating_2     1460 non-null   int64
     7   Heating_3     1460 non-null   int64
     8   Heating_4     1460 non-null   int64
     9   Heating_5     1460 non-null   int64
     10  Heating_6     1460 non-null   int64
     11  CentralAir_1  1460 non-null   int64
     12  CentralAir_2  1460 non-null   int64
    dtypes: float64(4), int64(8), object(1)
    memory usage: 148.4+ KB
    None
    """

    prefit_ordinal = True
    encoding_relation = util.EncodingRelation.ONE_TO_N_UNIQUE

    def __init__(
        self,
        verbose=0,
        cols=None,
        drop_invariant=False,
        return_df=True,
        handle_missing="value",
        handle_unknown="value",
        use_cat_names=None,
    ):
        super().__init__(
            verbose=verbose,
            cols=cols,
            drop_invariant=drop_invariant,
            return_df=return_df,
            handle_unknown=handle_unknown,
            handle_missing=handle_missing,
        )
        self._dim = None
        self.mapping = None
        self.use_cat_names = use_cat_names

    def _fit(self, X, y, **kwargs):
        oe_missing_strat = {
            'error': 'error',
            'return_nan': 'return_nan',
            'value': 'value',
            'indicator': 'return_nan',
        }[self.handle_missing]
        # Supply a custom mapping so that the ordinal codes follow the sorted
        # order of each column's categories rather than their order of appearance.
        ordered_mapping = []
        for col in self.cols:
            oe_col = OrdinalEncoder(
                verbose=self.verbose, cols=[col], handle_unknown="value", handle_missing=oe_missing_strat
            )
            oe_col.fit(X[col].sort_values().to_frame(name=col))
            ordered_mapping += oe_col.mapping

        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            handle_unknown="value",
            handle_missing=oe_missing_strat,
            mapping=ordered_mapping,
        )
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        self.mapping = self.generate_mapping()

        return self

    def _transform(self, X_in, override_return_df=False):
        X = X_in.copy(deep=True)
        X = self.ordinal_encoder.transform(X)
        input_cols = X.columns.values.tolist()

        if self.handle_unknown == "error":
            if X[self.cols].isin([-1]).any().any():
                raise ValueError("Columns to be encoded can not contain new values")

        for switch, ordinal_switch in zip(self.mapping, self.ordinal_encoder.category_mapping):
            col = switch.get("col")
            mod = switch.get("mapping")
            encode_feature_series = X[col]

            unknown_elements = encode_feature_series[encode_feature_series == -1]

            # Map each ordinal code to its row of the rank-hot (thermometer) matrix.
            encoding_dict = {i: list(row.values()) for i, row in mod.to_dict(orient="index").items()}
            if self.handle_unknown == "value":
                default_value = [0] * len(encoding_dict)
            elif self.handle_unknown == "return_nan":
                default_value = [np.nan] * len(encoding_dict)
            elif self.handle_unknown == "error":
                if not unknown_elements.empty:
                    unknowns_str = ', '.join([str(x) for x in unknown_elements.unique()])
                    msg = f"Unseen values {unknowns_str} during transform in column {col}."
                    raise ValueError(msg)
                default_value = [0] * len(encoding_dict)
            else:
                raise ValueError(f"invalid option for 'handle_unknown' parameter: {self.handle_unknown}")

            def apply_coding(row: pd.Series):
                val = row.iloc[0]
                if pd.isna(val):
                    if self.handle_missing == "value":
                        return default_value
                    elif self.handle_missing == "return_nan":
                        return [np.nan] * len(default_value)
                    else:
                        raise ValueError("Unhandled NaN")
                return encoding_dict.get(val, default_value)

            encoded = encode_feature_series.to_frame().apply(apply_coding, axis=1, result_type="expand")
            encoded.columns = mod.columns

            X = pd.concat([encoded, X], axis=1)

            # Replace the original column with its rank-hot columns at the same position.
            old_column_index = input_cols.index(col)
            input_cols[old_column_index:old_column_index + 1] = mod.columns
            X = X.reindex(columns=input_cols)

        return X

    def create_dataframe(self, X, encoded, key_col):
        if not isinstance(encoded, (pd.DataFrame, pd.Series)):
            encoded = pd.DataFrame(encoded, columns=key_col)

        X_ = pd.concat([encoded, X], axis=1)
        return X_

    def inverse_transform(self, X_in):
        X = X_in.copy(deep=True)
        cols = X.columns.values.tolist()
        if self._dim is None:
            raise ValueError("Must train encoder before it can be used to inverse_transform data")

        for switch, ordinal_mapping in zip(self.mapping, self.ordinal_encoder.category_mapping):
            col = switch.get("col")
            cats = switch.get("mapping")
            if col != ordinal_mapping.get("col"):
                raise ValueError("Column order of OrdinalEncoder and RankHotEncoder do not match")
            inv_map = {v: k for k, v in ordinal_mapping.get("mapping").to_dict().items()}

            # The ordinal rank equals the number of hot columns, so summing the
            # rank-hot columns recovers the ordinal code.
            arrs = X[cats.columns]
            reencode = arrs.sum(axis=1).rename(col)

            orig_dtype = ordinal_mapping.get("data_type")
            reencode2 = reencode.replace(inv_map).astype(orig_dtype)
            if np.any(reencode2 == 0):
                reencode2[reencode2 == 0] = np.nan

            X = self.create_dataframe(X, reencode2, col)

            # Drop the rank-hot columns and restore the original column position.
            first_index = cols.index(cats.columns[0])
            last_index = cols.index(cats.columns[-1]) + 1

            del cols[first_index:last_index]
            cols.insert(self.ordinal_encoder.feature_names_out_.index(col), col)

        X = X.reindex(columns=cols)

        return X

    def generate_mapping(self):
        mapping = []
        found_column_counts = {}

        for switch in self.ordinal_encoder.mapping:
            col: str = switch.get("col")
            values: pd.Series = switch.get("mapping").copy(deep=True)

            if self.handle_missing == "value":
                values = values[values > 0]

            if len(values) == 0:
                continue

            index = []
            new_columns = []

            for cat_name, class_ in values.items():
                if self.use_cat_names:
                    n_col_name = f"{col}_{cat_name}"
                    found_count = found_column_counts.get(n_col_name, 0)
                    found_column_counts[n_col_name] = found_count + 1
                    n_col_name += "#" * found_count
                else:
                    n_col_name = f"{col}_{class_}"

                index.append(class_)
                new_columns.append(n_col_name)

            # A lower-triangular matrix of ones gives the thermometer pattern:
            # the row for rank k is hot in its first k columns.
            base_matrix = np.tril(np.ones((len(index), len(index)), dtype=int))
            base_df = pd.DataFrame(data=base_matrix, columns=new_columns, index=index)

            mapping.append({"col": col, "mapping": base_df})
        return mapping
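

# Illustrative usage sketch (not part of the original module): a minimal demo
# of the rank-hot encoding on a tiny, made-up DataFrame, assuming the usual
# fit/transform plumbing provided by category_encoders' BaseEncoder. The
# column name "size" and its categories are hypothetical.
if __name__ == "__main__":
    df = pd.DataFrame({"size": ["S", "M", "L", "M", "S"]})
    enc = RankHotEncoder(cols=["size"], use_cat_names=True)
    enc.fit(df)
    # Categories are ranked by their sorted values (L=1, M=2, S=3), so "M" is
    # encoded as size_L=1, size_M=1, size_S=0: every column up to its rank is hot.
    print(enc.transform(df))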