Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ meh/*.ipynb
.DS_Store
/*.jl
scratchpad/
examples/test.jl
2 changes: 1 addition & 1 deletion src/MLJTransforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ using MLJModelInterface
using TableOperations
using StatsBase
using LinearAlgebra

using OrderedCollections: OrderedDict
# Other transformers
using Combinatorics
import Distributions
Expand Down
4 changes: 2 additions & 2 deletions src/encoders/contrast_encoder/contrast_encoder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ function contrast_encoder_fit(
throw(ArgumentError("Mode $feat_mode is not supported."))
end

vector_given_value_given_feature = Dict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
vector_given_value_given_feature = OrderedDict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
return vector_given_value_given_feature
end

Expand Down Expand Up @@ -159,5 +159,5 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia
"""
function contrast_encoder_transform(X, cache::Dict)
vector_given_value_given_feature = cache[:vector_given_value_given_feature]
return generic_transform(X, vector_given_value_given_feature, single_feat = false)
return generic_transform(X, vector_given_value_given_feature, single_feat = false; use_levelnames = true)
end
10 changes: 5 additions & 5 deletions src/encoders/contrast_encoder/interface_mlj.jl
Original file line number Diff line number Diff line change
Expand Up @@ -148,12 +148,12 @@ mach = fit!(machine(encoder, X))
Xnew = transform(mach, X)

julia > Xnew
(name_1 = [1.0, 0.0, 0.0, 0.0],
name_2 = [0.0, 1.0, 0.0, 1.0],
(name_John = [1.0, 0.0, 0.0, 0.0],
name_Mary = [0.0, 1.0, 0.0, 1.0],
height = [1.85, 1.67, 1.5, 1.67],
favnum_1 = [0.0, 1.0, 0.0, -1.0],
favnum_2 = [2.0, -1.0, 0.0, -1.0],
favnum_3 = [-1.0, -1.0, 3.0, -1.0],
favnum_5 = [0.0, 1.0, 0.0, -1.0],
favnum_7 = [2.0, -1.0, 0.0, -1.0],
favnum_10 = [-1.0, -1.0, 3.0, -1.0],
age = [23, 23, 14, 23],)
```

Expand Down
4 changes: 4 additions & 0 deletions src/encoders/target_encoding/interface_mlj.jl
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ struct TargetEncoderResult{
y_stat_given_feat_level::Dict{A, A}
task::S # "Regression", "Classification"
num_classes::I # num_classes in case of classification
y_classes::A # y_classes in case of classification

end


Expand All @@ -76,6 +78,7 @@ function MMI.fit(transformer::TargetEncoder, verbosity::Int, X, y)
generic_cache[:y_stat_given_feat_level],
generic_cache[:task],
generic_cache[:num_classes],
generic_cache[:y_classes],
)
report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features
cache = nothing
Expand All @@ -90,6 +93,7 @@ function MMI.transform(transformer::TargetEncoder, fitresult, Xnew)
fitresult.y_stat_given_feat_level,
:num_classes => fitresult.num_classes,
:task => fitresult.task,
:y_classes => fitresult.y_classes,
)
Xnew_transf = target_encoder_transform(Xnew, generic_cache)
return Xnew_transf
Expand Down
5 changes: 4 additions & 1 deletion src/encoders/target_encoding/target_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ function target_encoder_fit(
:num_classes => (task == "Regression") ? -1 : length(y_classes),
:y_stat_given_feat_level => y_stat_given_feat_level,
:encoded_features => encoded_features,
:y_classes => (task == "Regression") ? nothing : y_classes,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is fine but I'm curious, is there a reason to use a dictionary for the cache? More standard would be to use a named tuple, or if this needs to be mutable, a mutable struct.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, there isn't, and I plan to replace them all with named tuples, as in #5. Such a change would be deterministic, so I was waiting to merge the encoding types PR (and maybe this one) first to avoid conflicts.

)
return cache
end
Expand Down Expand Up @@ -243,11 +244,13 @@ function target_encoder_transform(X, cache)
task = cache[:task]
y_stat_given_feat_level = cache[:y_stat_given_feat_level]
num_classes = cache[:num_classes]
y_classes = cache[:y_classes]

return generic_transform(
X,
y_stat_given_feat_level;
single_feat = task == "Regression" || (task == "Classification" && num_classes < 3),
)
use_levelnames = true,
custom_levels = y_classes,)
end

78 changes: 56 additions & 22 deletions src/generic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,13 @@ function generic_fit(X,
feat_col = Tables.getcolumn(X, feat_name)
feat_type = elscitype(feat_col)
feat_has_allowed_type =
feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
feat_type <: Union{Missing, Multiclass} ||
(ordered_factor && feat_type <: Union{Missing, OrderedFactor})
if feat_has_allowed_type # then should be encoded
push!(encoded_features, feat_name)
# Compute the dict using the given feature_mapper function
mapping_per_feat_level[feat_name] = feature_mapper(feat_col, feat_name, args...; kwargs...)
mapping_per_feat_level[feat_name] =
feature_mapper(feat_col, feat_name, args...; kwargs...)
end
end
return mapping_per_feat_level, encoded_features
Expand All @@ -64,19 +66,37 @@ end
"""
**Private method.**

Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n
Function to generate new feature names: feat_name_1, feat_name_2,..., feat_name_n or, when `use_levelnames` is true,
feat_name_level_1, feat_name_level_2,..., feat_name_level_n
"""
function generate_new_feat_names(feat_name, num_inds, existing_names)
conflict = true # will be kept true as long as there is a conflict
count = 1 # number of conflicts+1 = number of underscores
function generate_new_feat_names(
feat_name,
num_inds,
levels,
existing_names;
use_levelnames = true,
)
# Convert levels (e.g. KeySet or Tuple) to an indexable vector
levels_vec = collect(levels)

conflict = true # true while there's a name clash
count = 1 # number of underscores in the suffix
new_column_names = Symbol[]

new_column_names = []
while conflict
suffix = repeat("_", count)
new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds]
suffix = repeat("_", count)
if use_levelnames
# Always use the first num_inds level names
new_column_names = [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ]
else
# Always use numeric indices
new_column_names = [ Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds ]
end
# Check for collisions
conflict = any(name -> name in existing_names, new_column_names)
count += 1
end

return new_column_names
end

Expand All @@ -85,22 +105,32 @@ end
"""
**Private method.**

Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
a subset of categorical features of X into a scalar or a vector (as specified in single_feat)

- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a scalar (single_feat=true)
- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a scalar (single_feat=true)

- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a set of k features where k is the length of the vector (single_feat=false)
- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a set of k features where k is the length of the vector (single_feat=false)
- In both cases it attempts to preserve the type of the table.
- In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
assumption is necessary because any column in X must correspond to a constant number of features
assumption is necessary because any column in X must correspond to a constant number of features
in the output table (which is equal to k).
- Features not in the dictionary are mapped to themselves (i.e., not changed).
- Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true; otherwise an error is raised.
- use_levelnames: if true, the new feature names are generated using the level names when the transform generates multiple features;
else they are generated using the indices of the levels.
- custom_levels: if not nothing, then the levels of the categorical features are replaced by the custom_levels
"""
function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore_unknown = false)
function generic_transform(
X,
mapping_per_feat_level;
single_feat = true,
ignore_unknown = false,
use_levelnames = false,
custom_levels = nothing,
)
feat_names = Tables.schema(X).names
new_feat_names = Symbol[]
new_cols = []
Expand All @@ -115,25 +145,29 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
if !issubset(test_levels, train_levels)
# get the levels in test that are not in train
lost_levels = setdiff(test_levels, train_levels)
error("While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.")
error(
"While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
)
end
end

if single_feat
level2scalar = mapping_per_feat_level[feat_name]
new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
push!(new_cols, new_col)
push!(new_feat_names, feat_name)
else
level2vector = mapping_per_feat_level[feat_name]
new_multi_col = map(x->get(level2vector, x, x), col)
new_multi_col = map(x -> get(level2vector, x, x), col)
new_multi_col = [col for col in eachrow(hcat(new_multi_col...))]
push!(new_cols, new_multi_col...)

feat_names_with_inds = generate_new_feat_names(
feat_name,
length(first(mapping_per_feat_level[feat_name])[2]),
feat_names,
(custom_levels === nothing) ? keys(mapping_per_feat_level[feat_name]) : custom_levels,
feat_names;
use_levelnames = use_levelnames,
)
push!(new_feat_names, feat_names_with_inds...)
end
Expand All @@ -144,8 +178,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
end
end

transformed_X= NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
# Attempt to preserve table type
transformed_X = Tables.materializer(X)(transformed_X)
return transformed_X
end
end
25 changes: 12 additions & 13 deletions test/encoders/target_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -277,22 +277,21 @@ end
X_tr = target_encoder_transform(X, cache)

enc = (col, level) -> cache[:y_stat_given_feat_level][col][level]

target = (
A_1 = [enc(:A, X[:A][i])[1] for i in 1:10],
A_2 = [enc(:A, X[:A][i])[2] for i in 1:10],
A_3 = [enc(:A, X[:A][i])[3] for i in 1:10],
A_0 = [enc(:A, X[:A][i])[1] for i in 1:10],
A_1 = [enc(:A, X[:A][i])[2] for i in 1:10],
A_2 = [enc(:A, X[:A][i])[3] for i in 1:10],
B = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
C_1 = [enc(:C, X[:C][i])[1] for i in 1:10],
C_2 = [enc(:C, X[:C][i])[2] for i in 1:10],
C_3 = [enc(:C, X[:C][i])[3] for i in 1:10],
D_1 = [enc(:D, X[:D][i])[1] for i in 1:10],
D_2 = [enc(:D, X[:D][i])[2] for i in 1:10],
D_3 = [enc(:D, X[:D][i])[3] for i in 1:10],
C_0 = [enc(:C, X[:C][i])[1] for i in 1:10],
C_1 = [enc(:C, X[:C][i])[2] for i in 1:10],
C_2 = [enc(:C, X[:C][i])[3] for i in 1:10],
D_0 = [enc(:D, X[:D][i])[1] for i in 1:10],
D_1 = [enc(:D, X[:D][i])[2] for i in 1:10],
D_2 = [enc(:D, X[:D][i])[3] for i in 1:10],
E = [1, 2, 3, 4, 5, 6, 6, 3, 2, 1],
F_1 = [enc(:F, X[:F][i])[1] for i in 1:10],
F_2 = [enc(:F, X[:F][i])[2] for i in 1:10],
F_3 = [enc(:F, X[:F][i])[3] for i in 1:10],
F_0 = [enc(:F, X[:F][i])[1] for i in 1:10],
F_1 = [enc(:F, X[:F][i])[2] for i in 1:10],
F_2 = [enc(:F, X[:F][i])[3] for i in 1:10],
)
for col in keys(target)
@test all(X_tr[col] .== target[col])
Expand Down
37 changes: 27 additions & 10 deletions test/generic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,38 @@ push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=false, retur
push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=true, return_y=false))

@testset "Generate New feature names Function Tests" begin
# Test 1: No initial conflicts
@testset "No Initial Conflicts" begin
existing_names = []
names = generate_new_feat_names("feat", 3, existing_names)
@test names == [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")]
levels = ("A", "B", "C")

# Test 1: No initial conflicts, indices mode (use_levelnames=false)
@testset "No Initial Conflicts (Indices)" begin
existing_names = Symbol[]
names = generate_new_feat_names("feat", 2, levels, existing_names; use_levelnames=false)
@test names == [Symbol("feat_1"), Symbol("feat_2")]
end

# Test 2: No conflicts, level-names mode (default use_levelnames=true)
@testset "No Initial Conflicts (Level Names)" begin
existing_names = Symbol[]
names = generate_new_feat_names("feat", 3, levels, existing_names)
@test names == [Symbol("feat_A"), Symbol("feat_B"), Symbol("feat_C")]
end

# Test 2: Handle initial conflict by adding underscores
@testset "Initial Conflict Resolution" begin
existing_names = [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")]
names = generate_new_feat_names("feat", 3, existing_names)
@test names == [Symbol("feat__1"), Symbol("feat__2"), Symbol("feat__3")]
# Test 3: Handle initial conflict by adding underscores (indices)
@testset "Initial Conflict Resolution (Indices)" begin
existing_names = [Symbol("feat_1"), Symbol("feat_2")]
names = generate_new_feat_names("feat", 2, levels, existing_names; use_levelnames=false)
@test names == [Symbol("feat__1"), Symbol("feat__2")]
end

# Test 4: Handle initial conflict by adding underscores (level names)
@testset "Initial Conflict Resolution (Level Names)" begin
existing_names = [Symbol("feat_A"), Symbol("feat_B"), Symbol("feat_C")]
names = generate_new_feat_names("feat", 3, levels, existing_names)
@test names == [Symbol("feat__A"), Symbol("feat__B"), Symbol("feat__C")]
end
end


# Dummy encoder that maps each level to its hash (some arbitrary function)
function dummy_encoder_fit(
X,
Expand Down
Loading