diff --git a/src/MLJTransforms.jl b/src/MLJTransforms.jl index f490241..dd5fa71 100644 --- a/src/MLJTransforms.jl +++ b/src/MLJTransforms.jl @@ -7,7 +7,7 @@ using MLJModelInterface using TableOperations using StatsBase using LinearAlgebra - +using OrderedCollections: OrderedDict # Other transformers using Combinatorics import Distributions diff --git a/src/encoders/contrast_encoder/contrast_encoder.jl b/src/encoders/contrast_encoder/contrast_encoder.jl index b6cdcb0..486e7e9 100644 --- a/src/encoders/contrast_encoder/contrast_encoder.jl +++ b/src/encoders/contrast_encoder/contrast_encoder.jl @@ -125,7 +125,7 @@ function contrast_encoder_fit( throw(ArgumentError("Mode $feat_mode is not supported.")) end - vector_given_value_given_feature = Dict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels)) + vector_given_value_given_feature = OrderedDict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels)) return vector_given_value_given_feature end @@ -159,5 +159,5 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia """ function contrast_encoder_transform(X, cache::Dict) vector_given_value_given_feature = cache[:vector_given_value_given_feature] - return generic_transform(X, vector_given_value_given_feature, single_feat = false) + return generic_transform(X, vector_given_value_given_feature, single_feat = false; use_levelnames = true) end \ No newline at end of file diff --git a/src/encoders/contrast_encoder/interface_mlj.jl b/src/encoders/contrast_encoder/interface_mlj.jl index 9c098fe..6585361 100644 --- a/src/encoders/contrast_encoder/interface_mlj.jl +++ b/src/encoders/contrast_encoder/interface_mlj.jl @@ -148,12 +148,12 @@ mach = fit!(machine(encoder, X)) Xnew = transform(mach, X) julia > Xnew - (name_1 = [1.0, 0.0, 0.0, 0.0], - name_2 = [0.0, 1.0, 0.0, 1.0], + (name_John = [1.0, 0.0, 0.0, 0.0], + name_Mary = [0.0, 1.0, 0.0, 1.0], height = [1.85, 1.67, 1.5, 1.67], - favnum_1 = [0.0, 1.0, 0.0, -1.0], - 
favnum_2 = [2.0, -1.0, 0.0, -1.0], - favnum_3 = [-1.0, -1.0, 3.0, -1.0], + favnum_5 = [0.0, 1.0, 0.0, -1.0], + favnum_7 = [2.0, -1.0, 0.0, -1.0], + favnum_10 = [-1.0, -1.0, 3.0, -1.0], age = [23, 23, 14, 23],) ``` diff --git a/src/encoders/target_encoding/interface_mlj.jl b/src/encoders/target_encoding/interface_mlj.jl index b416b90..686c48d 100644 --- a/src/encoders/target_encoding/interface_mlj.jl +++ b/src/encoders/target_encoding/interface_mlj.jl @@ -51,6 +51,8 @@ struct TargetEncoderResult{ y_stat_given_feat_level::Dict{A, A} task::S # "Regression", "Classification" num_classes::I # num_classes in case of classification + y_classes::A # y_classes in case of classification + end @@ -76,6 +78,7 @@ function MMI.fit(transformer::TargetEncoder, verbosity::Int, X, y) generic_cache[:y_stat_given_feat_level], generic_cache[:task], generic_cache[:num_classes], + generic_cache[:y_classes], ) report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features cache = nothing @@ -90,6 +93,7 @@ function MMI.transform(transformer::TargetEncoder, fitresult, Xnew) fitresult.y_stat_given_feat_level, :num_classes => fitresult.num_classes, :task => fitresult.task, + :y_classes => fitresult.y_classes, ) Xnew_transf = target_encoder_transform(Xnew, generic_cache) return Xnew_transf diff --git a/src/encoders/target_encoding/target_encoding.jl b/src/encoders/target_encoding/target_encoding.jl index 6dae479..e70d628 100644 --- a/src/encoders/target_encoding/target_encoding.jl +++ b/src/encoders/target_encoding/target_encoding.jl @@ -216,6 +216,7 @@ function target_encoder_fit( :num_classes => (task == "Regression") ? -1 : length(y_classes), :y_stat_given_feat_level => y_stat_given_feat_level, :encoded_features => encoded_features, + :y_classes => (task == "Regression") ? 
nothing : y_classes, ) return cache end @@ -244,11 +245,13 @@ function target_encoder_transform(X, cache) task = cache[:task] y_stat_given_feat_level = cache[:y_stat_given_feat_level] num_classes = cache[:num_classes] + y_classes = cache[:y_classes] return generic_transform( X, y_stat_given_feat_level; single_feat = task == "Regression" || (task == "Classification" && num_classes < 3), - ) + use_levelnames = true, + custom_levels = y_classes,) end diff --git a/src/generic.jl b/src/generic.jl index 80b2d24..8e909b0 100644 --- a/src/generic.jl +++ b/src/generic.jl @@ -66,19 +66,37 @@ end """ **Private method.** -Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n +Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n or if possible, +feat_name_level_0, feat_name_level_1,..., feat_name_level_n """ -function generate_new_feat_names(feat_name, num_inds, existing_names) - conflict = true # will be kept true as long as there is a conflict - count = 1 # number of conflicts+1 = number of underscores +function generate_new_feat_names( + feat_name, + num_inds, + levels, + existing_names; + use_levelnames = true, +) + # Convert levels (e.g. 
KeySet or Tuple) to an indexable vector + levels_vec = collect(levels) + + conflict = true # true while there's a name clash + count = 1 # number of underscores in the suffix + new_column_names = Symbol[] - new_column_names = [] while conflict suffix = repeat("_", count) - new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds] + if use_levelnames + # Always use the first num_inds level names + new_column_names = [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ] + else + # Always use numeric indices + new_column_names = [ Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds ] + end + # Check for collisions conflict = any(name -> name in existing_names, new_column_names) count += 1 end + return new_column_names end @@ -88,26 +106,30 @@ end **Private method.** Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in -a subset of categorical features of X into a scalar or a vector (as specified in single_feat) +a subset of categorical features of X into a scalar or a vector (as specified in `single_feat`) - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` - into a scalar (single_feat=true) + into a scalar (`single_feat=true`) - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` - into a set of k features where k is the length of the vector (single_feat=false) + into a set of `k` features where `k` is the length of the vector (`single_feat=false`) - In both cases it attempts to preserve the type of the table. - In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such assumption is necessary because any column in X must correspond to a constant number of features in the output table (which is equal to k). - Features not in the dictionary are mapped to themselves (i.e., not changed). 
- - Levels not in the nested dictionary are mapped to themselves if `ignore unknown` is true else raise an error. - - If `ensure_categorical` is true, then any input categorical column will remain categorical + - Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true else raise an error. + - use_levelnames: if true, the new feature names are generated using the level names when the transform generates multiple features; + else they are generated using the indices of the levels. + - custom_levels: if not `nothing`, then the levels of the categorical features are replaced by the custom_levels """ function generic_transform( X, mapping_per_feat_level; single_feat = true, ignore_unknown = false, + use_levelnames = false, + custom_levels = nothing, ensure_categorical = false, ) feat_names = Tables.schema(X).names @@ -149,7 +171,9 @@ function generic_transform( feat_names_with_inds = generate_new_feat_names( feat_name, length(first(mapping_per_feat_level[feat_name])[2]), - feat_names, + (custom_levels === nothing) ? keys(mapping_per_feat_level[feat_name]) : custom_levels, + feat_names; + use_levelnames = use_levelnames, ) push!(new_feat_names, feat_names_with_inds...) 
end diff --git a/test/encoders/target_encoding.jl b/test/encoders/target_encoding.jl index 946379e..4e6f0d0 100644 --- a/test/encoders/target_encoding.jl +++ b/test/encoders/target_encoding.jl @@ -277,22 +277,21 @@ end X_tr = target_encoder_transform(X, cache) enc = (col, level) -> cache[:y_stat_given_feat_level][col][level] - target = ( - A_1 = [enc(:A, X[:A][i])[1] for i in 1:10], - A_2 = [enc(:A, X[:A][i])[2] for i in 1:10], - A_3 = [enc(:A, X[:A][i])[3] for i in 1:10], + A_0 = [enc(:A, X[:A][i])[1] for i in 1:10], + A_1 = [enc(:A, X[:A][i])[2] for i in 1:10], + A_2 = [enc(:A, X[:A][i])[3] for i in 1:10], B = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], - C_1 = [enc(:C, X[:C][i])[1] for i in 1:10], - C_2 = [enc(:C, X[:C][i])[2] for i in 1:10], - C_3 = [enc(:C, X[:C][i])[3] for i in 1:10], - D_1 = [enc(:D, X[:D][i])[1] for i in 1:10], - D_2 = [enc(:D, X[:D][i])[2] for i in 1:10], - D_3 = [enc(:D, X[:D][i])[3] for i in 1:10], + C_0 = [enc(:C, X[:C][i])[1] for i in 1:10], + C_1 = [enc(:C, X[:C][i])[2] for i in 1:10], + C_2 = [enc(:C, X[:C][i])[3] for i in 1:10], + D_0 = [enc(:D, X[:D][i])[1] for i in 1:10], + D_1 = [enc(:D, X[:D][i])[2] for i in 1:10], + D_2 = [enc(:D, X[:D][i])[3] for i in 1:10], E = [1, 2, 3, 4, 5, 6, 6, 3, 2, 1], - F_1 = [enc(:F, X[:F][i])[1] for i in 1:10], - F_2 = [enc(:F, X[:F][i])[2] for i in 1:10], - F_3 = [enc(:F, X[:F][i])[3] for i in 1:10], + F_0 = [enc(:F, X[:F][i])[1] for i in 1:10], + F_1 = [enc(:F, X[:F][i])[2] for i in 1:10], + F_2 = [enc(:F, X[:F][i])[3] for i in 1:10], ) for col in keys(target) @test all(X_tr[col] .== target[col]) diff --git a/test/generic.jl b/test/generic.jl index ffaa4ae..4d9a805 100644 --- a/test/generic.jl +++ b/test/generic.jl @@ -27,21 +27,38 @@ push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=false, retur push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=true, return_y=false)) @testset "Generate New feature names Function Tests" begin - # Test 1: No initial 
conflicts - @testset "No Initial Conflicts" begin - existing_names = [] - names = generate_new_feat_names("feat", 3, existing_names) - @test names == [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")] + levels = ("A", "B", "C") + + # Test 1: No initial conflicts, indices mode (use_levelnames=false) + @testset "No Initial Conflicts (Indices)" begin + existing_names = Symbol[] + names = generate_new_feat_names("feat", 2, levels, existing_names; use_levelnames=false) + @test names == [Symbol("feat_1"), Symbol("feat_2")] + end + + # Test 2: No conflicts, level-names mode (default use_levelnames=true) + @testset "No Initial Conflicts (Level Names)" begin + existing_names = Symbol[] + names = generate_new_feat_names("feat", 3, levels, existing_names) + @test names == [Symbol("feat_A"), Symbol("feat_B"), Symbol("feat_C")] end - # Test 2: Handle initial conflict by adding underscores - @testset "Initial Conflict Resolution" begin - existing_names = [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")] - names = generate_new_feat_names("feat", 3, existing_names) - @test names == [Symbol("feat__1"), Symbol("feat__2"), Symbol("feat__3")] + # Test 3: Handle initial conflict by adding underscores (indices) + @testset "Initial Conflict Resolution (Indices)" begin + existing_names = [Symbol("feat_1"), Symbol("feat_2")] + names = generate_new_feat_names("feat", 2, levels, existing_names; use_levelnames=false) + @test names == [Symbol("feat__1"), Symbol("feat__2")] + end + + # Test 4: Handle initial conflict by adding underscores (level names) + @testset "Initial Conflict Resolution (Level Names)" begin + existing_names = [Symbol("feat_A"), Symbol("feat_B"), Symbol("feat_C")] + names = generate_new_feat_names("feat", 3, levels, existing_names) + @test names == [Symbol("feat__A"), Symbol("feat__B"), Symbol("feat__C")] end end + # Dummy encoder that maps each level to its hash (some arbitrary function) function dummy_encoder_fit( X,