@@ -49,11 +49,13 @@ function generic_fit(X,
4949 feat_col = Tables. getcolumn (X, feat_name)
5050 feat_type = elscitype (feat_col)
5151 feat_has_allowed_type =
52- feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor} )
52+ feat_type <: Union{Missing, Multiclass} ||
53+ (ordered_factor && feat_type <: Union{Missing, OrderedFactor} )
5354 if feat_has_allowed_type # then should be encoded
5455 push! (encoded_features, feat_name)
5556 # Compute the dict using the given feature_mapper function
56- mapping_per_feat_level[feat_name] = feature_mapper (feat_col, feat_name, args... ; kwargs... )
57+ mapping_per_feat_level[feat_name] =
58+ feature_mapper (feat_col, feat_name, args... ; kwargs... )
5759 end
5860 end
5961 return mapping_per_feat_level, encoded_features
@@ -72,7 +74,7 @@ function generate_new_feat_names(feat_name, num_inds, existing_names)
7274
7375 new_column_names = []
7476 while conflict
75- suffix = repeat (" _" , count)
77+ suffix = repeat (" _" , count)
7678 new_column_names = [Symbol (" $(feat_name)$(suffix)$i " ) for i in 1 : num_inds]
7779 conflict = any (name -> name in existing_names, new_column_names)
7880 count += 1
8587"""
8688**Private method.**
8789
88- Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
90+ Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
8991a subset of categorical features of X into a scalar or a vector (as specified in single_feat)
9092
91- - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
92- into a scalar (single_feat=true)
93+ - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
94+ into a scalar (single_feat=true)
9395
94- - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
95- into a set of k features where k is the length of the vector (single_feat=false)
96+ - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
97+ into a set of k features where k is the length of the vector (single_feat=false)
9698 - In both cases it attempts to preserve the type of the table.
9799 - In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
98- assumption is necessary because any column in X must correspond to a constant number of features
100+ assumption is necessary because any column in X must correspond to a constant number of features
99101 in the output table (which is equal to k).
100102 - Features not in the dictionary are mapped to themselves (i.e., not changed).
101- - Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error.
103+ - Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true; otherwise an error is raised.
104+ - If `ensure_categorical` is true, then any input categorical column will remain categorical.
102105"""
103- function generic_transform (X, mapping_per_feat_level; single_feat = true , ignore_unknown = false )
106+ function generic_transform (
107+ X,
108+ mapping_per_feat_level;
109+ single_feat = true ,
110+ ignore_unknown = false ,
111+ ensure_categorical = false ,
112+ )
104113 feat_names = Tables. schema (X). names
105114 new_feat_names = Symbol[]
106115 new_cols = []
@@ -115,18 +124,25 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
115124 if ! issubset (test_levels, train_levels)
116125 # get the levels in test that are not in train
117126 lost_levels = setdiff (test_levels, train_levels)
118- error (" While transforming, found novel levels for the column $(feat_name) : $(lost_levels) that were not seen while training." )
127+ error (
128+ " While transforming, found novel levels for the column $(feat_name) : $(lost_levels) that were not seen while training." ,
129+ )
119130 end
120131 end
121-
132+
122133 if single_feat
123134 level2scalar = mapping_per_feat_level[feat_name]
124- new_col = ! isempty (level2scalar) ? recode (col, level2scalar... ) : col
135+ if ensure_categorical
136+ new_col = ! isempty (level2scalar) ? recode (col, level2scalar... ) : col
137+ else
138+ new_col = ! isempty (level2scalar) ? unwrap .(recode (col, level2scalar... )) : col
139+ end
140+
125141 push! (new_cols, new_col)
126142 push! (new_feat_names, feat_name)
127143 else
128144 level2vector = mapping_per_feat_level[feat_name]
129- new_multi_col = map (x-> get (level2vector, x, x), col)
145+ new_multi_col = map (x -> get (level2vector, x, x), col)
130146 new_multi_col = [col for col in eachrow (hcat (new_multi_col... ))]
131147 push! (new_cols, new_multi_col... )
132148
@@ -144,8 +160,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
144160 end
145161 end
146162
147- transformed_X= NamedTuple {tuple(new_feat_names...)} (tuple (new_cols)... )
163+ transformed_X = NamedTuple {tuple(new_feat_names...)} (tuple (new_cols)... )
148164 # Attempt to preserve table type
149165 transformed_X = Tables. materializer (X)(transformed_X)
150166 return transformed_X
151- end
167+ end
0 commit comments