2 changes: 1 addition & 1 deletion src/MLJTransforms.jl
@@ -7,7 +7,7 @@ using MLJModelInterface
using TableOperations
using StatsBase
using LinearAlgebra

using OrderedCollections: OrderedDict
# Other transformers
using Combinatorics
import Distributions
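
The new `OrderedCollections` import backs the switch, in the encoders below, from `Dict` to `OrderedDict` for the level-to-vector mappings. A rough sketch of why insertion order matters here (the example values are invented):

```julia
using OrderedCollections: OrderedDict

# Base Dict iterates in hash order, so level => vector pairs (and any column
# names derived from them) can come back in an arbitrary order.
d = Dict("John" => 1, "Mary" => 2, "Sam" => 3)
collect(keys(d))   # order not guaranteed

# OrderedDict iterates in insertion order, so levels come back exactly as
# they were first seen.
od = OrderedDict("John" => 1, "Mary" => 2, "Sam" => 3)
collect(keys(od))  # ["John", "Mary", "Sam"], deterministically
```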
4 changes: 2 additions & 2 deletions src/encoders/contrast_encoder/contrast_encoder.jl
@@ -125,7 +125,7 @@ function contrast_encoder_fit(
throw(ArgumentError("Mode $feat_mode is not supported."))
end

vector_given_value_given_feature = Dict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
vector_given_value_given_feature = OrderedDict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
return vector_given_value_given_feature
end

@@ -159,5 +159,5 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia
"""
function contrast_encoder_transform(X, cache::Dict)
vector_given_value_given_feature = cache[:vector_given_value_given_feature]
return generic_transform(X, vector_given_value_given_feature, single_feat = false)
return generic_transform(X, vector_given_value_given_feature, single_feat = false; use_levelnames = true)
end
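
A minimal sketch of what the fitted mapping holds, assuming a hypothetical three-level feature and an arbitrary contrast matrix (the real matrix depends on the selected `mode`):

```julia
using OrderedCollections: OrderedDict

feat_levels = ["John", "Mary", "Sam"]   # hypothetical levels of one feature

# Arbitrary 3×2 contrast matrix, purely for illustration.
contrastmatrix = [ 1.0  0.0;
                   0.0  1.0;
                  -1.0 -1.0]

# Same construction as in the diff: each level maps to its row of the matrix.
vector_given_value_given_feature =
    OrderedDict(level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))

# Insertion order is preserved, so the keys come back as ["John", "Mary", "Sam"],
# which is what lets the transform name the generated columns after the levels
# (`use_levelnames = true`) in a stable way.
collect(keys(vector_given_value_given_feature))
```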
10 changes: 5 additions & 5 deletions src/encoders/contrast_encoder/interface_mlj.jl
@@ -148,12 +148,12 @@ mach = fit!(machine(encoder, X))
Xnew = transform(mach, X)

julia > Xnew
(name_1 = [1.0, 0.0, 0.0, 0.0],
name_2 = [0.0, 1.0, 0.0, 1.0],
(name_John = [1.0, 0.0, 0.0, 0.0],
name_Mary = [0.0, 1.0, 0.0, 1.0],
height = [1.85, 1.67, 1.5, 1.67],
favnum_1 = [0.0, 1.0, 0.0, -1.0],
favnum_2 = [2.0, -1.0, 0.0, -1.0],
favnum_3 = [-1.0, -1.0, 3.0, -1.0],
favnum_5 = [0.0, 1.0, 0.0, -1.0],
favnum_7 = [2.0, -1.0, 0.0, -1.0],
favnum_10 = [-1.0, -1.0, 3.0, -1.0],
age = [23, 23, 14, 23],)
```

4 changes: 4 additions & 0 deletions src/encoders/target_encoding/interface_mlj.jl
@@ -51,6 +51,8 @@ struct TargetEncoderResult{
y_stat_given_feat_level::Dict{A, A}
task::S # "Regression", "Classification"
num_classes::I # num_classes in case of classification
y_classes::A # y_classes in case of classification

end


@@ -76,6 +78,7 @@ function MMI.fit(transformer::TargetEncoder, verbosity::Int, X, y)
generic_cache[:y_stat_given_feat_level],
generic_cache[:task],
generic_cache[:num_classes],
generic_cache[:y_classes],
)
report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features
cache = nothing
@@ -90,6 +93,7 @@ function MMI.transform(transformer::TargetEncoder, fitresult, Xnew)
fitresult.y_stat_given_feat_level,
:num_classes => fitresult.num_classes,
:task => fitresult.task,
:y_classes => fitresult.y_classes,
)
Xnew_transf = target_encoder_transform(Xnew, generic_cache)
return Xnew_transf
5 changes: 4 additions & 1 deletion src/encoders/target_encoding/target_encoding.jl
@@ -216,6 +216,7 @@ function target_encoder_fit(
:num_classes => (task == "Regression") ? -1 : length(y_classes),
:y_stat_given_feat_level => y_stat_given_feat_level,
:encoded_features => encoded_features,
:y_classes => (task == "Regression") ? nothing : y_classes,

Member: This is fine, but I'm curious: is there a reason to use a dictionary for the cache? More standard would be a named tuple or, if this needs to be mutable, a mutable struct.

Collaborator (author): There isn't, indeed, and I plan to replace them all with named tuples as in #5. Such a change would be deterministic, so I was waiting to merge the encoding-types PR (and maybe this one) to avoid conflicts.

)
return cache
end
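
Following the Dict-versus-named-tuple discussion above, a rough sketch, not the package's current API, of what the same cache could look like as a `NamedTuple`; the values are invented:

```julia
# Invented stand-ins for the values computed inside `target_encoder_fit`.
task      = "Classification"
y_classes = ["no", "yes", "maybe"]

cache = (
    task                    = task,
    num_classes             = (task == "Regression") ? -1 : length(y_classes),
    y_classes               = (task == "Regression") ? nothing : y_classes,
    y_stat_given_feat_level = Dict(:A => Dict("x" => [0.2, 0.5, 0.3])),
    encoded_features        = [:A],
)

cache.task         # field access replaces cache[:task]
cache.num_classes  # 3
```

Being immutable, a named tuple also documents that the fit result is never mutated after construction.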
@@ -244,11 +245,13 @@ function target_encoder_transform(X, cache)
task = cache[:task]
y_stat_given_feat_level = cache[:y_stat_given_feat_level]
num_classes = cache[:num_classes]
y_classes = cache[:y_classes]

return generic_transform(
X,
y_stat_given_feat_level;
single_feat = task == "Regression" || (task == "Classification" && num_classes < 3),
)
use_levelnames = true,
custom_levels = y_classes,)
end
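
For classification, a small sketch (with invented values) of why `custom_levels = y_classes` is passed: the vectors stored per feature level are indexed by target class, so the target classes, not the feature's own levels, should name the generated columns:

```julia
using OrderedCollections: OrderedDict

y_classes = ["a", "b", "c"]   # hypothetical target classes

# Each level of feature :A maps to a vector of per-class statistics
# (one entry per target class), not one value per level of :A.
y_stat_given_feat_level = Dict(
    :A => OrderedDict("x" => [0.5, 0.3, 0.2],
                      "y" => [0.1, 0.6, 0.3]),
)

# With `use_levelnames = true` and `custom_levels = y_classes`, the expanded
# columns are expected to be named :A_a, :A_b and :A_c, rather than after
# :A's own levels ("x" and "y").
```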

48 changes: 36 additions & 12 deletions src/generic.jl
@@ -66,19 +66,37 @@ end
"""
**Private method.**

Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n
Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n or if possible,
feat_name_level_0, feat_name_level_1,..., feat_name_level_n
"""
function generate_new_feat_names(feat_name, num_inds, existing_names)
conflict = true # will be kept true as long as there is a conflict
count = 1 # number of conflicts+1 = number of underscores
function generate_new_feat_names(
feat_name,
num_inds,
levels,
existing_names;
use_levelnames = true,
)
# Convert levels (e.g. KeySet or Tuple) to an indexable vector
levels_vec = collect(levels)

conflict = true # true while there's a name clash
count = 1 # number of underscores in the suffix
new_column_names = Symbol[]

new_column_names = []
while conflict
suffix = repeat("_", count)
new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds]
if use_levelnames
# Always use the first num_inds level names
new_column_names = [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ]
else
# Always use numeric indices
new_column_names = [ Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds ]
end
# Check for collisions
conflict = any(name -> name in existing_names, new_column_names)
count += 1
end

return new_column_names
end

Expand All @@ -88,26 +106,30 @@ end
**Private method.**

Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
a subset of categorical features of X into a scalar or a vector (as specified in single_feat)
a subset of categorical features of X into a scalar or a vector (as specified in `single_feat`)

- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a scalar (single_feat=true)
into a scalar (`single_feat=true`)

- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a set of k features where k is the length of the vector (single_feat=false)
into a set of `k` features where `k` is the length of the vector (`single_feat=false`)
- In both cases it attempts to preserve the type of the table.
- In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
assumption is necessary because any column in X must correspond to a constant number of features
in the output table (which is equal to k).
- Features not in the dictionary are mapped to themselves (i.e., not changed).
- Levels not in the nested dictionary are mapped to themselves if `ignore unknown` is true else raise an error.
- If `ensure_categorical` is true, then any input categorical column will remain categorical
- Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error.
- use_levelnames: if true, the new feature names are generated using the level names when the transform generates multiple features;
else they are generated using the indices of the levels.
- custom_levels: if not `nothing`, then the levels of the categorical features are replaced by the custom_levels
"""
function generic_transform(
X,
mapping_per_feat_level;
single_feat = true,
ignore_unknown = false,
use_levelnames = false,
custom_levels = nothing,
ensure_categorical = false,
)
feat_names = Tables.schema(X).names
@@ -149,7 +171,9 @@ function generic_transform(
feat_names_with_inds = generate_new_feat_names(
feat_name,
length(first(mapping_per_feat_level[feat_name])[2]),
feat_names,
(custom_levels === nothing) ? keys(mapping_per_feat_level[feat_name]) : custom_levels,
feat_names;
use_levelnames = use_levelnames,
)
push!(new_feat_names, feat_names_with_inds...)
end
25 changes: 12 additions & 13 deletions test/encoders/target_encoding.jl
@@ -277,22 +277,21 @@ end
X_tr = target_encoder_transform(X, cache)

enc = (col, level) -> cache[:y_stat_given_feat_level][col][level]

target = (
A_1 = [enc(:A, X[:A][i])[1] for i in 1:10],
A_2 = [enc(:A, X[:A][i])[2] for i in 1:10],
A_3 = [enc(:A, X[:A][i])[3] for i in 1:10],
A_0 = [enc(:A, X[:A][i])[1] for i in 1:10],
A_1 = [enc(:A, X[:A][i])[2] for i in 1:10],
A_2 = [enc(:A, X[:A][i])[3] for i in 1:10],
B = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
C_1 = [enc(:C, X[:C][i])[1] for i in 1:10],
C_2 = [enc(:C, X[:C][i])[2] for i in 1:10],
C_3 = [enc(:C, X[:C][i])[3] for i in 1:10],
D_1 = [enc(:D, X[:D][i])[1] for i in 1:10],
D_2 = [enc(:D, X[:D][i])[2] for i in 1:10],
D_3 = [enc(:D, X[:D][i])[3] for i in 1:10],
C_0 = [enc(:C, X[:C][i])[1] for i in 1:10],
C_1 = [enc(:C, X[:C][i])[2] for i in 1:10],
C_2 = [enc(:C, X[:C][i])[3] for i in 1:10],
D_0 = [enc(:D, X[:D][i])[1] for i in 1:10],
D_1 = [enc(:D, X[:D][i])[2] for i in 1:10],
D_2 = [enc(:D, X[:D][i])[3] for i in 1:10],
E = [1, 2, 3, 4, 5, 6, 6, 3, 2, 1],
F_1 = [enc(:F, X[:F][i])[1] for i in 1:10],
F_2 = [enc(:F, X[:F][i])[2] for i in 1:10],
F_3 = [enc(:F, X[:F][i])[3] for i in 1:10],
F_0 = [enc(:F, X[:F][i])[1] for i in 1:10],
F_1 = [enc(:F, X[:F][i])[2] for i in 1:10],
F_2 = [enc(:F, X[:F][i])[3] for i in 1:10],
)
for col in keys(target)
@test all(X_tr[col] .== target[col])
37 changes: 27 additions & 10 deletions test/generic.jl
@@ -27,21 +27,38 @@ push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=false, retur
push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=true, return_y=false))

@testset "Generate New feature names Function Tests" begin
# Test 1: No initial conflicts
@testset "No Initial Conflicts" begin
existing_names = []
names = generate_new_feat_names("feat", 3, existing_names)
@test names == [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")]
levels = ("A", "B", "C")

# Test 1: No initial conflicts, indices mode (use_levelnames=false)
@testset "No Initial Conflicts (Indices)" begin
existing_names = Symbol[]
names = generate_new_feat_names("feat", 2, levels, existing_names; use_levelnames=false)
@test names == [Symbol("feat_1"), Symbol("feat_2")]
end

# Test 2: No conflicts, level-names mode (default use_levelnames=true)
@testset "No Initial Conflicts (Level Names)" begin
existing_names = Symbol[]
names = generate_new_feat_names("feat", 3, levels, existing_names)
@test names == [Symbol("feat_A"), Symbol("feat_B"), Symbol("feat_C")]
end

# Test 2: Handle initial conflict by adding underscores
@testset "Initial Conflict Resolution" begin
existing_names = [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")]
names = generate_new_feat_names("feat", 3, existing_names)
@test names == [Symbol("feat__1"), Symbol("feat__2"), Symbol("feat__3")]
# Test 3: Handle initial conflict by adding underscores (indices)
@testset "Initial Conflict Resolution (Indices)" begin
existing_names = [Symbol("feat_1"), Symbol("feat_2")]
names = generate_new_feat_names("feat", 2, levels, existing_names; use_levelnames=false)
@test names == [Symbol("feat__1"), Symbol("feat__2")]
end

# Test 4: Handle initial conflict by adding underscores (level names)
@testset "Initial Conflict Resolution (Level Names)" begin
existing_names = [Symbol("feat_A"), Symbol("feat_B"), Symbol("feat_C")]
names = generate_new_feat_names("feat", 3, levels, existing_names)
@test names == [Symbol("feat__A"), Symbol("feat__B"), Symbol("feat__C")]
end
end


# Dummy encoder that maps each level to its hash (some arbitrary function)
function dummy_encoder_fit(
X,