@@ -49,11 +49,13 @@ function generic_fit(X,
49
49
feat_col = Tables. getcolumn (X, feat_name)
50
50
feat_type = elscitype (feat_col)
51
51
feat_has_allowed_type =
52
- feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor} )
52
+ feat_type <: Union{Missing, Multiclass} ||
53
+ (ordered_factor && feat_type <: Union{Missing, OrderedFactor} )
53
54
if feat_has_allowed_type # then should be encoded
54
55
push! (encoded_features, feat_name)
55
56
# Compute the dict using the given feature_mapper function
56
- mapping_per_feat_level[feat_name] = feature_mapper (feat_col, feat_name, args... ; kwargs... )
57
+ mapping_per_feat_level[feat_name] =
58
+ feature_mapper (feat_col, feat_name, args... ; kwargs... )
57
59
end
58
60
end
59
61
return mapping_per_feat_level, encoded_features
@@ -72,7 +74,7 @@ function generate_new_feat_names(feat_name, num_inds, existing_names)
72
74
73
75
new_column_names = []
74
76
while conflict
75
- suffix = repeat (" _" , count)
77
+ suffix = repeat (" _" , count)
76
78
new_column_names = [Symbol (" $(feat_name)$(suffix)$i " ) for i in 1 : num_inds]
77
79
conflict = any (name -> name in existing_names, new_column_names)
78
80
count += 1
85
87
"""
86
88
**Private method.**
87
89
88
- Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
90
+ Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
89
91
a subset of the categorical features of `X` into a scalar or a vector (as specified by `single_feat`)
90
92
91
- - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
92
- into a scalar (single_feat=true)
93
+ - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
94
+ into a scalar (single_feat=true)
93
95
94
- - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
95
- into a set of k features where k is the length of the vector (single_feat=false)
96
+ - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
97
+ into a set of k features where k is the length of the vector (single_feat=false)
96
98
- In both cases it attempts to preserve the type of the table.
97
99
- In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
98
- assumption is necessary because any column in X must correspond to a constant number of features
100
+ assumption is necessary because any column in X must correspond to a constant number of features
99
101
in the output table (which is equal to k).
100
102
- Features not in the dictionary are mapped to themselves (i.e., not changed).
101
- - Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error.
103
+ - Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true; otherwise an error is raised.
104
+ - If `ensure_categorical` is true, then any input categorical column will remain categorical
102
105
"""
103
- function generic_transform (X, mapping_per_feat_level; single_feat = true , ignore_unknown = false )
106
+ function generic_transform (
107
+ X,
108
+ mapping_per_feat_level;
109
+ single_feat = true ,
110
+ ignore_unknown = false ,
111
+ ensure_categorical = false ,
112
+ )
104
113
feat_names = Tables. schema (X). names
105
114
new_feat_names = Symbol[]
106
115
new_cols = []
@@ -115,18 +124,25 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
115
124
if ! issubset (test_levels, train_levels)
116
125
# get the levels in test that are not in train
117
126
lost_levels = setdiff (test_levels, train_levels)
118
- error (" While transforming, found novel levels for the column $(feat_name) : $(lost_levels) that were not seen while training." )
127
+ error (
128
+ " While transforming, found novel levels for the column $(feat_name) : $(lost_levels) that were not seen while training." ,
129
+ )
119
130
end
120
131
end
121
-
132
+
122
133
if single_feat
123
134
level2scalar = mapping_per_feat_level[feat_name]
124
- new_col = ! isempty (level2scalar) ? recode (col, level2scalar... ) : col
135
+ if ensure_categorical
136
+ new_col = ! isempty (level2scalar) ? recode (col, level2scalar... ) : col
137
+ else
138
+ new_col = ! isempty (level2scalar) ? unwrap .(recode (col, level2scalar... )) : col
139
+ end
140
+
125
141
push! (new_cols, new_col)
126
142
push! (new_feat_names, feat_name)
127
143
else
128
144
level2vector = mapping_per_feat_level[feat_name]
129
- new_multi_col = map (x-> get (level2vector, x, x), col)
145
+ new_multi_col = map (x -> get (level2vector, x, x), col)
130
146
new_multi_col = [col for col in eachrow (hcat (new_multi_col... ))]
131
147
push! (new_cols, new_multi_col... )
132
148
@@ -144,8 +160,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
144
160
end
145
161
end
146
162
147
- transformed_X= NamedTuple {tuple(new_feat_names...)} (tuple (new_cols)... )
163
+ transformed_X = NamedTuple {tuple(new_feat_names...)} (tuple (new_cols)... )
148
164
# Attempt to preserve table type
149
165
transformed_X = Tables. materializer (X)(transformed_X)
150
166
return transformed_X
151
- end
167
+ end
0 commit comments