Skip to content

Commit 98893ea

Browse files
authored
Merge pull request #25 from JuliaAI/use-level-names-for-categories
Use level names for categories
2 parents d9d28ed + c401f3b commit 98893ea

File tree

8 files changed

+91
-44
lines changed

8 files changed

+91
-44
lines changed

src/MLJTransforms.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ using MLJModelInterface
77
using TableOperations
88
using StatsBase
99
using LinearAlgebra
10-
10+
using OrderedCollections: OrderedDict
1111
# Other transformers
1212
using Combinatorics
1313
import Distributions

src/encoders/contrast_encoder/contrast_encoder.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ function contrast_encoder_fit(
125125
throw(ArgumentError("Mode $feat_mode is not supported."))
126126
end
127127

128-
vector_given_value_given_feature = Dict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
128+
vector_given_value_given_feature = OrderedDict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
129129
return vector_given_value_given_feature
130130
end
131131

@@ -159,5 +159,5 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia
159159
"""
160160
function contrast_encoder_transform(X, cache::Dict)
161161
vector_given_value_given_feature = cache[:vector_given_value_given_feature]
162-
return generic_transform(X, vector_given_value_given_feature, single_feat = false)
162+
return generic_transform(X, vector_given_value_given_feature, single_feat = false; use_levelnames = true)
163163
end

src/encoders/contrast_encoder/interface_mlj.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -148,12 +148,12 @@ mach = fit!(machine(encoder, X))
148148
Xnew = transform(mach, X)
149149
150150
julia> Xnew
151-
(name_1 = [1.0, 0.0, 0.0, 0.0],
152-
name_2 = [0.0, 1.0, 0.0, 1.0],
151+
(name_John = [1.0, 0.0, 0.0, 0.0],
152+
name_Mary = [0.0, 1.0, 0.0, 1.0],
153153
height = [1.85, 1.67, 1.5, 1.67],
154-
favnum_1 = [0.0, 1.0, 0.0, -1.0],
155-
favnum_2 = [2.0, -1.0, 0.0, -1.0],
156-
favnum_3 = [-1.0, -1.0, 3.0, -1.0],
154+
favnum_5 = [0.0, 1.0, 0.0, -1.0],
155+
favnum_7 = [2.0, -1.0, 0.0, -1.0],
156+
favnum_10 = [-1.0, -1.0, 3.0, -1.0],
157157
age = [23, 23, 14, 23],)
158158
```
159159

src/encoders/target_encoding/interface_mlj.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ struct TargetEncoderResult{
5151
y_stat_given_feat_level::Dict{A, A}
5252
task::S # "Regression", "Classification"
5353
num_classes::I # num_classes in case of classification
54+
y_classes::A # y_classes in case of classification
55+
5456
end
5557

5658

@@ -76,6 +78,7 @@ function MMI.fit(transformer::TargetEncoder, verbosity::Int, X, y)
7678
generic_cache[:y_stat_given_feat_level],
7779
generic_cache[:task],
7880
generic_cache[:num_classes],
81+
generic_cache[:y_classes],
7982
)
8083
report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features
8184
cache = nothing
@@ -90,6 +93,7 @@ function MMI.transform(transformer::TargetEncoder, fitresult, Xnew)
9093
fitresult.y_stat_given_feat_level,
9194
:num_classes => fitresult.num_classes,
9295
:task => fitresult.task,
96+
:y_classes => fitresult.y_classes,
9397
)
9498
Xnew_transf = target_encoder_transform(Xnew, generic_cache)
9599
return Xnew_transf

src/encoders/target_encoding/target_encoding.jl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ function target_encoder_fit(
216216
:num_classes => (task == "Regression") ? -1 : length(y_classes),
217217
:y_stat_given_feat_level => y_stat_given_feat_level,
218218
:encoded_features => encoded_features,
219+
:y_classes => (task == "Regression") ? nothing : y_classes,
219220
)
220221
return cache
221222
end
@@ -244,11 +245,13 @@ function target_encoder_transform(X, cache)
244245
task = cache[:task]
245246
y_stat_given_feat_level = cache[:y_stat_given_feat_level]
246247
num_classes = cache[:num_classes]
248+
y_classes = cache[:y_classes]
247249

248250
return generic_transform(
249251
X,
250252
y_stat_given_feat_level;
251253
single_feat = task == "Regression" || (task == "Classification" && num_classes < 3),
252-
)
254+
use_levelnames = true,
255+
custom_levels = y_classes,)
253256
end
254257

src/generic.jl

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -66,19 +66,37 @@ end
6666
"""
6767
**Private method.**
6868
69-
Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n
69+
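Function to generate new feature names: `feat_name_1`, `feat_name_2`, ..., `feat_name_n` (index suffixes) or, when `use_levelnames` is true,
70+
`feat_name_<level_1>`, `feat_name_<level_2>`, ..., `feat_name_<level_n>` (level-name suffixes). Extra underscores are added until no name clashes with `existing_names`.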
7071
"""
71-
function generate_new_feat_names(feat_name, num_inds, existing_names)
72-
conflict = true # will be kept true as long as there is a conflict
73-
count = 1 # number of conflicts+1 = number of underscores
72+
function generate_new_feat_names(
73+
feat_name,
74+
num_inds,
75+
levels,
76+
existing_names;
77+
use_levelnames = true,
78+
)
79+
# Convert levels (e.g. KeySet or Tuple) to an indexable vector
80+
levels_vec = collect(levels)
81+
82+
conflict = true # true while there's a name clash
83+
count = 1 # number of underscores in the suffix
84+
new_column_names = Symbol[]
7485

75-
new_column_names = []
7686
while conflict
7787
suffix = repeat("_", count)
78-
new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds]
88+
if use_levelnames
89+
# Always use the first num_inds level names
90+
new_column_names = [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ]
91+
else
92+
# Always use numeric indices
93+
new_column_names = [ Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds ]
94+
end
95+
# Check for collisions
7996
conflict = any(name -> name in existing_names, new_column_names)
8097
count += 1
8198
end
99+
82100
return new_column_names
83101
end
84102

@@ -88,26 +106,30 @@ end
88106
**Private method.**
89107
90108
Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
91-
a subset of categorical features of X into a scalar or a vector (as specified in single_feat)
109+
a subset of categorical features of X into a scalar or a vector (as specified in `single_feat`)
92110
93111
- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
94-
into a scalar (single_feat=true)
112+
into a scalar (`single_feat=true`)
95113
96114
- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
97-
into a set of k features where k is the length of the vector (single_feat=false)
115+
into a set of `k` features where `k` is the length of the vector (`single_feat=false`)
98116
- In both cases it attempts to preserve the type of the table.
99117
- In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. This
100118
assumption is necessary because any column in `X` must correspond to a constant number of features
101119
in the output table (which is equal to `k`).
102120
- Features not in the dictionary are mapped to themselves (i.e., not changed).
103-
- Levels not in the nested dictionary are mapped to themselves if `ignore unknown` is true else raise an error.
104-
- If `ensure_categorical` is true, then any input categorical column will remain categorical
121+
- Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true; otherwise an error is raised.
122+
- `use_levelnames`: if true, the new feature names are generated using the level names when the transform generates multiple features;
123+
otherwise they are generated using the indices of the levels (1, 2, ...).
124+
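- `custom_levels`: if not `nothing`, these are used in place of the feature's own levels (the keys of its mapping) when generating the new feature names.
- If `ensure_categorical` is true, then any input categorical column will remain categorical.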
105125
"""
106126
function generic_transform(
107127
X,
108128
mapping_per_feat_level;
109129
single_feat = true,
110130
ignore_unknown = false,
131+
use_levelnames = false,
132+
custom_levels = nothing,
111133
ensure_categorical = false,
112134
)
113135
feat_names = Tables.schema(X).names
@@ -149,7 +171,9 @@ function generic_transform(
149171
feat_names_with_inds = generate_new_feat_names(
150172
feat_name,
151173
length(first(mapping_per_feat_level[feat_name])[2]),
152-
feat_names,
174+
(custom_levels === nothing) ? keys(mapping_per_feat_level[feat_name]) : custom_levels,
175+
feat_names;
176+
use_levelnames = use_levelnames,
153177
)
154178
push!(new_feat_names, feat_names_with_inds...)
155179
end

test/encoders/target_encoding.jl

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -277,22 +277,21 @@ end
277277
X_tr = target_encoder_transform(X, cache)
278278

279279
enc = (col, level) -> cache[:y_stat_given_feat_level][col][level]
280-
281280
target = (
282-
A_1 = [enc(:A, X[:A][i])[1] for i in 1:10],
283-
A_2 = [enc(:A, X[:A][i])[2] for i in 1:10],
284-
A_3 = [enc(:A, X[:A][i])[3] for i in 1:10],
281+
A_0 = [enc(:A, X[:A][i])[1] for i in 1:10],
282+
A_1 = [enc(:A, X[:A][i])[2] for i in 1:10],
283+
A_2 = [enc(:A, X[:A][i])[3] for i in 1:10],
285284
B = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
286-
C_1 = [enc(:C, X[:C][i])[1] for i in 1:10],
287-
C_2 = [enc(:C, X[:C][i])[2] for i in 1:10],
288-
C_3 = [enc(:C, X[:C][i])[3] for i in 1:10],
289-
D_1 = [enc(:D, X[:D][i])[1] for i in 1:10],
290-
D_2 = [enc(:D, X[:D][i])[2] for i in 1:10],
291-
D_3 = [enc(:D, X[:D][i])[3] for i in 1:10],
285+
C_0 = [enc(:C, X[:C][i])[1] for i in 1:10],
286+
C_1 = [enc(:C, X[:C][i])[2] for i in 1:10],
287+
C_2 = [enc(:C, X[:C][i])[3] for i in 1:10],
288+
D_0 = [enc(:D, X[:D][i])[1] for i in 1:10],
289+
D_1 = [enc(:D, X[:D][i])[2] for i in 1:10],
290+
D_2 = [enc(:D, X[:D][i])[3] for i in 1:10],
292291
E = [1, 2, 3, 4, 5, 6, 6, 3, 2, 1],
293-
F_1 = [enc(:F, X[:F][i])[1] for i in 1:10],
294-
F_2 = [enc(:F, X[:F][i])[2] for i in 1:10],
295-
F_3 = [enc(:F, X[:F][i])[3] for i in 1:10],
292+
F_0 = [enc(:F, X[:F][i])[1] for i in 1:10],
293+
F_1 = [enc(:F, X[:F][i])[2] for i in 1:10],
294+
F_2 = [enc(:F, X[:F][i])[3] for i in 1:10],
296295
)
297296
for col in keys(target)
298297
@test all(X_tr[col] .== target[col])

test/generic.jl

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,21 +27,38 @@ push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=false, retur
2727
push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=true, return_y=false))
2828

2929
@testset "Generate New feature names Function Tests" begin
30-
# Test 1: No initial conflicts
31-
@testset "No Initial Conflicts" begin
32-
existing_names = []
33-
names = generate_new_feat_names("feat", 3, existing_names)
34-
@test names == [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")]
30+
levels = ("A", "B", "C")
31+
32+
# Test 1: No initial conflicts, indices mode (use_levelnames=false)
33+
@testset "No Initial Conflicts (Indices)" begin
34+
existing_names = Symbol[]
35+
names = generate_new_feat_names("feat", 2, levels, existing_names; use_levelnames=false)
36+
@test names == [Symbol("feat_1"), Symbol("feat_2")]
37+
end
38+
39+
# Test 2: No conflicts, level-names mode (default use_levelnames=true)
40+
@testset "No Initial Conflicts (Level Names)" begin
41+
existing_names = Symbol[]
42+
names = generate_new_feat_names("feat", 3, levels, existing_names)
43+
@test names == [Symbol("feat_A"), Symbol("feat_B"), Symbol("feat_C")]
3544
end
3645

37-
# Test 2: Handle initial conflict by adding underscores
38-
@testset "Initial Conflict Resolution" begin
39-
existing_names = [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")]
40-
names = generate_new_feat_names("feat", 3, existing_names)
41-
@test names == [Symbol("feat__1"), Symbol("feat__2"), Symbol("feat__3")]
46+
# Test 3: Handle initial conflict by adding underscores (indices)
47+
@testset "Initial Conflict Resolution (Indices)" begin
48+
existing_names = [Symbol("feat_1"), Symbol("feat_2")]
49+
names = generate_new_feat_names("feat", 2, levels, existing_names; use_levelnames=false)
50+
@test names == [Symbol("feat__1"), Symbol("feat__2")]
51+
end
52+
53+
# Test 4: Handle initial conflict by adding underscores (level names)
54+
@testset "Initial Conflict Resolution (Level Names)" begin
55+
existing_names = [Symbol("feat_A"), Symbol("feat_B"), Symbol("feat_C")]
56+
names = generate_new_feat_names("feat", 3, levels, existing_names)
57+
@test names == [Symbol("feat__A"), Symbol("feat__B"), Symbol("feat__C")]
4258
end
4359
end
4460

61+
4562
# Dummy encoder that maps each level to its hash (some arbitrary function)
4663
function dummy_encoder_fit(
4764
X,

0 commit comments

Comments
 (0)