Skip to content

Commit a815b76

Browse files
committed
✨ Support for using level names instead of indices for new columns in transform
1 parent 14a5671 commit a815b76

File tree

3 files changed

+84
-32
lines changed

3 files changed

+84
-32
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,4 @@ meh/*.ipynb
2727
.DS_Store
2828
/*.jl
2929
scratchpad/
30+
examples/test.jl

src/generic.jl

Lines changed: 56 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,13 @@ function generic_fit(X,
4949
feat_col = Tables.getcolumn(X, feat_name)
5050
feat_type = elscitype(feat_col)
5151
feat_has_allowed_type =
52-
feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
52+
feat_type <: Union{Missing, Multiclass} ||
53+
(ordered_factor && feat_type <: Union{Missing, OrderedFactor})
5354
if feat_has_allowed_type # then should be encoded
5455
push!(encoded_features, feat_name)
5556
# Compute the dict using the given feature_mapper function
56-
mapping_per_feat_level[feat_name] = feature_mapper(feat_col, feat_name, args...; kwargs...)
57+
mapping_per_feat_level[feat_name] =
58+
feature_mapper(feat_col, feat_name, args...; kwargs...)
5759
end
5860
end
5961
return mapping_per_feat_level, encoded_features
@@ -64,19 +66,37 @@ end
6466
"""
6567
**Private method.**
6668
67-
Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n
69+
Function to generate new feature names: feat_name_1, feat_name_2,..., feat_name_n or, if `use_levelnames` is true,
70+
feat_name_level1, feat_name_level2,..., feat_name_leveln (built from the first n level names)
6871
"""
function generate_new_feat_names(
    feat_name,
    num_inds,
    levels,
    existing_names;
    use_levelnames = true,
)
    # Build `num_inds` column names of the form `<feat_name><underscores><label>` such
    # that none of them clashes with a name in `existing_names`.
    #
    # - `feat_name`: name of the feature the new columns are derived from.
    # - `num_inds`: number of names to generate.
    # - `levels`: any iterable of level names (e.g. a KeySet or Tuple).
    # - `existing_names`: collection of names the result must not collide with.
    # - `use_levelnames`: if true, label the columns with the first `num_inds` level
    #   names; otherwise (or when fewer than `num_inds` levels are available) label
    #   them with the indices `1:num_inds`.
    #
    # Returns a `Vector{Symbol}` of length `num_inds`.

    # Convert levels (e.g. KeySet or Tuple) to an indexable vector
    levels_vec = collect(levels)

    # Level names can label the columns only when there are at least `num_inds` of
    # them; otherwise fall back to indices instead of failing with a `BoundsError`.
    labels = if use_levelnames && num_inds <= length(levels_vec)
        String[string(level) for level in levels_vec[1:num_inds]]
    else
        String[string(i) for i in 1:num_inds]
    end

    # The labels do not depend on the suffix, so only the suffix changes per attempt.
    underscores = 1
    while true
        suffix = repeat("_", underscores)
        new_column_names = Symbol[Symbol(feat_name, suffix, label) for label in labels]
        # Stop at the first suffix length that produces no clash with an existing name
        any(in(existing_names), new_column_names) || return new_column_names
        underscores += 1
    end
end
82102

@@ -85,22 +105,32 @@ end
85105
"""
86106
**Private method.**
87107
88-
Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
108+
Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
89109
a subset of categorical features of X into a scalar or a vector (as specified in single_feat)
90110
91-
- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
92-
into a scalar (single_feat=true)
111+
- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
112+
into a scalar (single_feat=true)
93113
94-
- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
95-
into a set of k features where k is the length of the vector (single_feat=false)
114+
- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
115+
into a set of k features where k is the length of the vector (single_feat=false)
96116
- In both cases it attempts to preserve the type of the table.
97117
- In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
98-
assumption is necessary because any column in X must correspond to a constant number of features
118+
assumption is necessary because any column in X must correspond to a constant number of features
99119
in the output table (which is equal to k).
100120
- Features not in the dictionary are mapped to themselves (i.e., not changed).
101121
- Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true else raise an error.
122+
- `use_levelnames`: if true, the new feature names are built from the level names whenever the transform generates multiple features (`single_feat=false`);
123+
else they are built from the integer indices `1:k`.
124+
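- `custom_levels`: if not `nothing`, these names are used in place of the keys of `mapping_per_feat_level[feat_name]` when generating the new feature names (only relevant when `single_feat=false`); the mapping itself is not changed.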
102125
"""
103-
function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore_unknown = false)
126+
function generic_transform(
127+
X,
128+
mapping_per_feat_level;
129+
single_feat = true,
130+
ignore_unknown = false,
131+
use_levelnames = false,
132+
custom_levels = nothing,
133+
)
104134
feat_names = Tables.schema(X).names
105135
new_feat_names = Symbol[]
106136
new_cols = []
@@ -115,25 +145,29 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
115145
if !issubset(test_levels, train_levels)
116146
# get the levels in test that are not in train
117147
lost_levels = setdiff(test_levels, train_levels)
118-
error("While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.")
148+
error(
149+
"While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
150+
)
119151
end
120152
end
121-
153+
122154
if single_feat
123155
level2scalar = mapping_per_feat_level[feat_name]
124156
new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
125157
push!(new_cols, new_col)
126158
push!(new_feat_names, feat_name)
127159
else
128160
level2vector = mapping_per_feat_level[feat_name]
129-
new_multi_col = map(x->get(level2vector, x, x), col)
161+
new_multi_col = map(x -> get(level2vector, x, x), col)
130162
new_multi_col = [col for col in eachrow(hcat(new_multi_col...))]
131163
push!(new_cols, new_multi_col...)
132164

133165
feat_names_with_inds = generate_new_feat_names(
134166
feat_name,
135167
length(first(mapping_per_feat_level[feat_name])[2]),
136-
feat_names,
168+
(custom_levels === nothing) ? keys(mapping_per_feat_level[feat_name]) : custom_levels,
169+
feat_names;
170+
use_levelnames = use_levelnames,
137171
)
138172
push!(new_feat_names, feat_names_with_inds...)
139173
end
@@ -144,8 +178,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
144178
end
145179
end
146180

147-
transformed_X= NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
181+
transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
148182
# Attempt to preserve table type
149183
transformed_X = Tables.materializer(X)(transformed_X)
150184
return transformed_X
151-
end
185+
end

test/generic.jl

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,21 +27,38 @@ push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=false, retur
2727
push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=true, return_y=false))
2828

2929
@testset "Generate New feature names Function Tests" begin
    lvls = ("A", "B", "C")

    # Indices mode with nothing to clash with: one underscore precedes each index
    @testset "No Initial Conflicts (Indices)" begin
        generated =
            generate_new_feat_names("feat", 2, lvls, Symbol[]; use_levelnames = false)
        @test generated == [:feat_1, :feat_2]
    end

    # Level-names mode is the default of generate_new_feat_names
    @testset "No Initial Conflicts (Level Names)" begin
        generated = generate_new_feat_names("feat", 3, lvls, Symbol[])
        @test generated == [:feat_A, :feat_B, :feat_C]
    end

    # A clash with an existing name adds one more underscore (indices mode)
    @testset "Initial Conflict Resolution (Indices)" begin
        taken = [:feat_1, :feat_2]
        generated =
            generate_new_feat_names("feat", 2, lvls, taken; use_levelnames = false)
        @test generated == [:feat__1, :feat__2]
    end

    # A clash with an existing name adds one more underscore (level-names mode)
    @testset "Initial Conflict Resolution (Level Names)" begin
        taken = [:feat_A, :feat_B, :feat_C]
        generated = generate_new_feat_names("feat", 3, lvls, taken)
        @test generated == [:feat__A, :feat__B, :feat__C]
    end
end
4460

61+
4562
# Dummy encoder that maps each level to its hash (some arbitrary function)
4663
function dummy_encoder_fit(
4764
X,

0 commit comments

Comments
 (0)