@@ -49,11 +49,13 @@ function generic_fit(X,
4949 feat_col = Tables. getcolumn (X, feat_name)
5050 feat_type = elscitype (feat_col)
5151 feat_has_allowed_type =
52- feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor} )
52+ feat_type <: Union{Missing, Multiclass} ||
53+ (ordered_factor && feat_type <: Union{Missing, OrderedFactor} )
5354 if feat_has_allowed_type # then should be encoded
5455 push! (encoded_features, feat_name)
5556 # Compute the dict using the given feature_mapper function
56- mapping_per_feat_level[feat_name] = feature_mapper (feat_col, feat_name, args... ; kwargs... )
57+ mapping_per_feat_level[feat_name] =
58+ feature_mapper (feat_col, feat_name, args... ; kwargs... )
5759 end
5860 end
5961 return mapping_per_feat_level, encoded_features
"""
**Private method.**

Generate the names of the `num_inds` columns that replace the feature `feat_name`.

Every name has the form `<feat_name><suffix><tag>`, where `<suffix>` is one or more
underscores and `<tag>` is either

  - the level name (`feat_name_<level_1>, ..., feat_name_<level_n>`) when `use_levelnames`
    is true and `levels` holds at least `num_inds` entries, or
  - the 1-based position (`feat_name_1, ..., feat_name_n`) otherwise.

If any generated name already occurs in `existing_names`, one more underscore is added to
the suffix and all names are regenerated, until no name clashes.

# Arguments
  - `feat_name`: name of the feature being expanded.
  - `num_inds`: number of new names to generate.
  - `levels`: iterable of level names (e.g. a `KeySet` or a `Tuple`); only the first
    `num_inds` entries, in iteration order, are used.
  - `existing_names`: collection of names the new names must not collide with.
  - `use_levelnames`: prefer level names over numeric indices (default `true`).

# Returns
A `Vector{Symbol}` of length `num_inds`.
"""
function generate_new_feat_names(
    feat_name,
    num_inds,
    levels,
    existing_names;
    use_levelnames = true,
)
    # Convert levels (e.g. KeySet or Tuple) to an indexable vector
    levels_vec = collect(levels)

    # Level names are only usable when there is one for every generated column;
    # otherwise fall back to numeric indices instead of throwing a BoundsError.
    name_by_level = use_levelnames && length(levels_vec) >= num_inds
    tag(i) = name_by_level ? levels_vec[i] : i

    conflict = true   # true while there is a name clash
    count = 1         # number of underscores in the suffix
    new_column_names = Symbol[]

    while conflict
        suffix = repeat("_", count)
        new_column_names =
            Symbol[Symbol("$(feat_name)$(suffix)$(tag(i))") for i in 1:num_inds]
        # Check for collisions with names already present in the table
        conflict = any(name -> name in existing_names, new_column_names)
        count += 1
    end

    return new_column_names
end
82102
85105"""
86106**Private method.**
87107
88- Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
108+ Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
89109a subset of categorical features of X into a scalar or a vector (as specified in single_feat)
90110
91- - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
92- into a scalar (single_feat=true)
111+ - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
112+ into a scalar (single_feat=true)
93113
94- - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
95- into a set of k features where k is the length of the vector (single_feat=false)
114+ - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
115+ into a set of k features where k is the length of the vector (single_feat=false)
96116 - In both cases it attempts to preserve the type of the table.
97117 - In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
98- assumption is necessary because any column in X must correspond to a constant number of features
118+ assumption is necessary because any column in X must correspond to a constant number of features
99119 in the output table (which is equal to k).
100120 - Features not in the dictionary are mapped to themselves (i.e., not changed).
101121 - Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true; otherwise an error is raised.
122+ - use_levelnames: if true, the new feature names are generated using the level names when the transform generates multiple features;
123+ else they are generated using the indices of the levels.
124+ - custom_levels: if not nothing, these levels are used in place of the keys of `mapping_per_feat_level[feat_name]` when generating the new feature names
102125"""
103- function generic_transform (X, mapping_per_feat_level; single_feat = true , ignore_unknown = false )
126+ function generic_transform (
127+ X,
128+ mapping_per_feat_level;
129+ single_feat = true ,
130+ ignore_unknown = false ,
131+ use_levelnames = false ,
132+ custom_levels = nothing ,
133+ )
104134 feat_names = Tables. schema (X). names
105135 new_feat_names = Symbol[]
106136 new_cols = []
@@ -115,25 +145,29 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
115145 if ! issubset (test_levels, train_levels)
116146 # get the levels in test that are not in train
117147 lost_levels = setdiff (test_levels, train_levels)
118- error (" While transforming, found novel levels for the column $(feat_name) : $(lost_levels) that were not seen while training." )
148+ error (
149+ " While transforming, found novel levels for the column $(feat_name) : $(lost_levels) that were not seen while training." ,
150+ )
119151 end
120152 end
121-
153+
122154 if single_feat
123155 level2scalar = mapping_per_feat_level[feat_name]
124156 new_col = ! isempty (level2scalar) ? recode (col, level2scalar... ) : col
125157 push! (new_cols, new_col)
126158 push! (new_feat_names, feat_name)
127159 else
128160 level2vector = mapping_per_feat_level[feat_name]
129- new_multi_col = map (x-> get (level2vector, x, x), col)
161+ new_multi_col = map (x -> get (level2vector, x, x), col)
130162 new_multi_col = [col for col in eachrow (hcat (new_multi_col... ))]
131163 push! (new_cols, new_multi_col... )
132164
133165 feat_names_with_inds = generate_new_feat_names (
134166 feat_name,
135167 length (first (mapping_per_feat_level[feat_name])[2 ]),
136- feat_names,
168+ (custom_levels === nothing ) ? keys (mapping_per_feat_level[feat_name]) : custom_levels,
169+ feat_names;
170+ use_levelnames = use_levelnames,
137171 )
138172 push! (new_feat_names, feat_names_with_inds... )
139173 end
@@ -144,8 +178,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
144178 end
145179 end
146180
147- transformed_X= NamedTuple {tuple(new_feat_names...)} (tuple (new_cols)... )
181+ transformed_X = NamedTuple {tuple(new_feat_names...)} (tuple (new_cols)... )
148182 # Attempt to preserve table type
149183 transformed_X = Tables. materializer (X)(transformed_X)
150184 return transformed_X
151- end
185+ end
0 commit comments