|
66 | 66 | """
|
67 | 67 | **Private method.**
|
68 | 68 |
|
69 |
| -Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n |
| 69 | +Function to generate new feature names: feat_name_1, feat_name_2,..., feat_name_n or, if `use_levelnames` is true, |
| 70 | +feat_name_level_1, feat_name_level_2,..., feat_name_level_n (where level_i is the i-th entry of `levels`) |
70 | 71 | """
|
71 |
| -function generate_new_feat_names(feat_name, num_inds, existing_names) |
72 |
| - conflict = true # will be kept true as long as there is a conflict |
73 |
| - count = 1 # number of conflicts+1 = number of underscores |
| 72 | +function generate_new_feat_names( |
| 73 | + feat_name, |
| 74 | + num_inds, |
| 75 | + levels, |
| 76 | + existing_names; |
| 77 | + use_levelnames = true, |
| 78 | +) |
| 79 | + # Convert levels (e.g. KeySet or Tuple) to an indexable vector |
| 80 | + levels_vec = collect(levels) |
| 81 | + |
| 82 | + conflict = true # true while there's a name clash |
| 83 | + count = 1 # number of underscores in the suffix |
| 84 | + new_column_names = Symbol[] |
74 | 85 |
|
75 |
| - new_column_names = [] |
76 | 86 | while conflict
|
77 | 87 | suffix = repeat("_", count)
|
78 |
| - new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds] |
| 88 | + if use_levelnames |
| 89 | + # Always use the first num_inds level names |
| 90 | + new_column_names = [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ] |
| 91 | + else |
| 92 | + # Always use numeric indices |
| 93 | + new_column_names = [ Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds ] |
| 94 | + end |
| 95 | + # Check for collisions |
79 | 96 | conflict = any(name -> name in existing_names, new_column_names)
|
80 | 97 | count += 1
|
81 | 98 | end
|
| 99 | + |
82 | 100 | return new_column_names
|
83 | 101 | end
|
84 | 102 |
|
|
88 | 106 | **Private method.**
|
89 | 107 |
|
90 | 108 | Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
|
91 |
| -a subset of categorical features of X into a scalar or a vector (as specified in single_feat) |
| 109 | +a subset of categorical features of X into a scalar or a vector (as specified in `single_feat`) |
92 | 110 |
|
93 | 111 | - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
|
94 |
| - into a scalar (single_feat=true) |
| 112 | + into a scalar (`single_feat=true`) |
95 | 113 |
|
96 | 114 | - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
|
97 |
| - into a set of k features where k is the length of the vector (single_feat=false) |
| 115 | + into a set of `k` features where `k` is the length of the vector (`single_feat=false`) |
98 | 116 | - In both cases it attempts to preserve the type of the table.
|
99 | 117 | - In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
|
100 | 118 | assumption is necessary because any column in X must correspond to a constant number of features
|
101 | 119 | in the output table (which is equal to k).
|
102 | 120 | - Features not in the dictionary are mapped to themselves (i.e., not changed).
|
103 |
| - - Levels not in the nested dictionary are mapped to themselves if `ignore unknown` is true else raise an error. |
104 |
| - - If `ensure_categorical` is true, then any input categorical column will remain categorical |
| 121 | + - Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true; otherwise an error is raised. |
| 122 | + - `use_levelnames`: if true, the new feature names are generated using the level names when the transform generates multiple features; |
| 123 | + otherwise they are generated using the indices of the levels. |
| 124 | + - `custom_levels`: if not `nothing`, the levels of the categorical features are replaced by `custom_levels`. |
105 | 125 | """
|
106 | 126 | function generic_transform(
|
107 | 127 | X,
|
108 | 128 | mapping_per_feat_level;
|
109 | 129 | single_feat = true,
|
110 | 130 | ignore_unknown = false,
|
| 131 | + use_levelnames = false, |
| 132 | + custom_levels = nothing, |
111 | 133 | ensure_categorical = false,
|
112 | 134 | )
|
113 | 135 | feat_names = Tables.schema(X).names
|
@@ -149,7 +171,9 @@ function generic_transform(
|
149 | 171 | feat_names_with_inds = generate_new_feat_names(
|
150 | 172 | feat_name,
|
151 | 173 | length(first(mapping_per_feat_level[feat_name])[2]),
|
152 |
| - feat_names, |
| 174 | + (custom_levels === nothing) ? keys(mapping_per_feat_level[feat_name]) : custom_levels, |
| 175 | + feat_names; |
| 176 | + use_levelnames = use_levelnames, |
153 | 177 | )
|
154 | 178 | push!(new_feat_names, feat_names_with_inds...)
|
155 | 179 | end
|
|
0 commit comments