 def convert_sklearn_one_hot_encoder(scope, operator, container):
     op = operator.raw_operator
     C = operator.inputs[0].type.shape[1]
-    categorical_feature_indices = [i for i, mat in enumerate(op.categories_) if mat is not None and len(mat) > 0]

     # encoded_slot_sizes[i] is the number of output coordinates associated with the ith categorical feature
     categorical_values_per_feature = []
-
-    categorical_values_per_feature = []
-    for cat in op.categories_:
-        if cat is None and len(cat) == 0:
-            continue
-        if cat.dtype in (numpy.float32, numpy.float64, numpy.int32, numpy.int64):
-            categorical_values_per_feature.append(list(cat.astype(numpy.int64)))
-        elif cat.dtype in (numpy.str, numpy.unicode, numpy.object):
-            categorical_values_per_feature.append([str(_) for _ in cat])
+    if hasattr(op, 'categories_'):
+        categorical_feature_indices = [i for i, mat in enumerate(op.categories_) if mat is not None and len(mat) > 0]
+
+        for cat in op.categories_:
+            if cat is None or len(cat) == 0:
+                continue
+            if cat.dtype in (numpy.float32, numpy.float64, numpy.int32, numpy.int64):
+                categorical_values_per_feature.append(list(cat.astype(numpy.int64)))
+            elif cat.dtype in (numpy.str, numpy.unicode, numpy.object):
+                categorical_values_per_feature.append([str(_) for _ in cat])
+            else:
+                raise TypeError("Categories must be int or strings not {0}.".format(cat.dtype))
+    else:
+        if op.categorical_features == 'all':
+            categorical_feature_indices = [i for i in range(C)]
+        elif isinstance(op.categorical_features, collections.Iterable):
+            if all(isinstance(i, bool) for i in op.categorical_features):
+                categorical_feature_indices = [i for i, active in enumerate(op.categorical_features) if active]
+            else:
+                categorical_feature_indices = [int(i) for i in op.categorical_features]
+        else:
+            raise ValueError('Unknown operation mode')
+
+        if op.n_values == 'auto':
+            # Use active features to determine the output length of each encoded feature
+            for i in range(len(op.feature_indices_) - 1):
+                allowed_values = []
+                index_head = op.feature_indices_[i]
+                index_tail = op.feature_indices_[i + 1]  # the feature indexed by index_tail is not included in this category
+                for j in op.active_features_:
+                    if index_head <= j and j < index_tail:
+                        allowed_values.append(j - index_head)
+                categorical_values_per_feature.append(allowed_values)
+        elif isinstance(op.n_values, numbers.Integral):
+            # Each categorical feature will be mapped to a fixed-length one-hot sub-vector
+            for i in range(len(op.feature_indices_) - 1):
+                categorical_values_per_feature.append(list(i for i in range(op.n_values)))
         else:
-            raise TypeError("Categories must be int or strings not {0}.".format(cat.dtype))
+            # Each categorical feature has its own sub-vector length
+            for max_index in op.n_values:
+                categorical_values_per_feature.append(list(i for i in range(max_index)))

     # Variable names produced by one-hot encoders. Each of them is the encoding result of a categorical feature.
     final_variable_names = []
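
The first added branch relies on the categories_ attribute that scikit-learn's OneHotEncoder exposes after fitting (0.20 and later): one array of observed values per input column. The sketch below is illustrative only, not part of the converter; the sample matrix and variable names are made up, and it simply shows what categories_ looks like and how the same dtype dispatch turns it into categorical_values_per_feature.

import numpy
from sklearn.preprocessing import OneHotEncoder

# Toy input: two integer columns; values are invented for illustration.
X = numpy.array([[0, 10], [1, 20], [0, 30]], dtype=numpy.int64)
enc = OneHotEncoder().fit(X)
print(enc.categories_)   # e.g. [array([0, 1]), array([10, 20, 30])]

# Same dispatch as the converter: numeric categories become int64 lists,
# anything else is stringified.
categorical_values_per_feature = []
for cat in enc.categories_:
    if cat is None or len(cat) == 0:
        continue
    if cat.dtype in (numpy.float32, numpy.float64, numpy.int32, numpy.int64):
        categorical_values_per_feature.append(list(cat.astype(numpy.int64)))
    else:
        categorical_values_per_feature.append([str(v) for v in cat])

print(categorical_values_per_feature)   # [[0, 1], [10, 20, 30]]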
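Older scikit-learn releases (before categories_ existed) describe a fitted encoder through n_values, feature_indices_ and active_features_ instead. The sketch below replays the n_values == 'auto' branch on hand-made attribute values; the numbers are invented for illustration and plain lists stand in for the real numpy arrays.

# Hypothetical fitted attributes of a legacy OneHotEncoder with two features:
# feature i owns the output slots [feature_indices_[i], feature_indices_[i + 1]),
# and active_features_ lists the slots actually observed during fit.
feature_indices_ = [0, 3, 7]
active_features_ = [0, 2, 3, 5, 6]

categorical_values_per_feature = []
for i in range(len(feature_indices_) - 1):
    index_head = feature_indices_[i]
    index_tail = feature_indices_[i + 1]
    # Keep the active slots belonging to this feature, rebased so they start at 0.
    allowed_values = [j - index_head for j in active_features_ if index_head <= j < index_tail]
    categorical_values_per_feature.append(allowed_values)

print(categorical_values_per_feature)   # [[0, 2], [0, 2, 3]]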
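The two remaining legacy branches need no fitted statistics at all: an integer n_values means every feature admits the values 0 .. n_values - 1, and a per-feature sequence gives each feature its own range. Again, the numbers below are illustrative.

# n_values given as a single integer: every categorical feature gets the same range.
n_values = 4
print([list(range(n_values)) for _ in range(3)])            # [[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]

# n_values given per feature: feature i admits 0 .. n_values[i] - 1.
n_values = [2, 5]
print([list(range(max_index)) for max_index in n_values])   # [[0, 1], [0, 1, 2, 3, 4]]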