From 7f12234a1a9489d471012ab8d2e9116c6826c237 Mon Sep 17 00:00:00 2001
From: Essam
Date: Fri, 16 May 2025 07:46:36 +0300
Subject: [PATCH 1/7] =?UTF-8?q?=E2=9C=A8=20Add=20callable=20features=20and?=
 =?UTF-8?q?=20better=20error=20testing?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/generic.jl                           | 31 ++++++++++++++++--
 test/encoders/contrast_encoder.jl        | 25 ++++++++++++++-----
 test/encoders/missingness_encoding.jl    | 13 +++++++---
 test/encoders/target_encoding.jl         | 14 ++++++-----
 test/generic.jl                          | 23 +++++++++++++++++-
 test/transformers/cardinality_reducer.jl | 16 ++++++++----
 6 files changed, 90 insertions(+), 32 deletions(-)

diff --git a/src/generic.jl b/src/generic.jl
index 3acc398..3fce0a7 100644
--- a/src/generic.jl
+++ b/src/generic.jl
@@ -13,21 +13,22 @@ logic?"

 # Arguments

-  - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
-    `Multiclass` or `OrderedFactor`
-  - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-  - `ignore=true`: Whether to exclude or includes the features given in `features`
-  - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
-  - `feature_mapper`: Defined above.
+  - X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
+    Multiclass or OrderedFactor
+  - features=[]: A list of names of categorical features given as symbols to exclude or include from encoding,
+    or a callable that returns true for features to be included/excluded
+  - ignore=true: Whether to exclude or include the features given in features
+  - ordered_factor=false: Whether to encode OrderedFactor or ignore them
+  - feature_mapper: Defined above.

 # Returns

-  - `mapping_per_feat_level`: Maps each level for each feature in a subset of the categorical features of
+  - mapping_per_feat_level: Maps each level for each feature in a subset of the categorical features of
     X into a scalar or a vector.
-  - `encoded_features`: The subset of the categorical features of X that were encoded
+  - encoded_features: The subset of the categorical features of X that were encoded
 """
 function generic_fit(X,
-    features::AbstractVector{Symbol} = Symbol[],
+    features::Union{AbstractVector{Symbol}, Function} = Symbol[],
     args...;
     ignore::Bool = true,
     ordered_factor::Bool = false,
@@ -38,7 +39,17 @@ function generic_fit(X,
     feat_names = Tables.schema(X).names

     #2. Modify column_names based on features
-    feat_names = (ignore) ? setdiff(feat_names, features) : intersect(feat_names, features)
+    if features isa Function
+        # If features is a callable, apply it to each feature name
+        if ignore
+            feat_names = filter(name -> !features(name), feat_names)
+        else
+            feat_names = filter(features, feat_names)
+        end
+    else
+        # Original behavior for vector of symbols
+        feat_names = (ignore) ? setdiff(feat_names, features) : intersect(feat_names, features)
+    end
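
# A quick sketch of what the two accepted forms of `features` do in the branch
# above (hypothetical values; suppose feat_names = (:name, :grade, :height)):
#
#   features = [:name], ignore = true   -> (:grade, :height)   # setdiff branch
#   features = [:name], ignore = false  -> (:name,)            # intersect branch
#   features = n -> startswith(string(n), "g"), ignore = false -> (:grade,)
#   features = n -> startswith(string(n), "g"), ignore = true  -> (:name, :height)
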
    # 3. Define mapping per column per level dictionary
    mapping_per_feat_level = Dict()

diff --git a/test/encoders/contrast_encoder.jl b/test/encoders/contrast_encoder.jl
index fa110c7..c47b46e 100644
--- a/test/encoders/contrast_encoder.jl
+++ b/test/encoders/contrast_encoder.jl
@@ -9,7 +9,6 @@ age = [23, 23, 14, 23])

 @testset "Contrast Encoder Error Handling" begin
-
     # Example definitions to allow the test to run
     function dummy_buildmatrix(colname, k)
         # Simple dummy function to generate a matrix of correct size
@@ -23,21 +22,35 @@ age = [23, 23, 14, 23])
     )

     # Test IGNORE_MUST_FALSE_VEC_MODE error
-    @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=[:contrast], ignore=true)
+    @test_throws MLJTransforms.IGNORE_MUST_FALSE_VEC_MODE begin
+        contrast_encoder_fit(data, [:A], mode=[:contrast], ignore=true)
+    end

     # Test LENGTH_MISMATCH_VEC_MODE error
-    @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=[:contrast, :dummy], buildmatrix=dummy_buildmatrix, ignore=false)
+    @test_throws MLJTransforms.LENGTH_MISMATCH_VEC_MODE(2, 1) begin
+        contrast_encoder_fit(data, [:A], mode=[:contrast, :dummy], buildmatrix=dummy_buildmatrix, ignore=false)
+    end

     # Test BUILDFUNC_MUST_BE_SPECIFIED error
-    @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=:contrast, ignore=false)
+    @test_throws MLJTransforms.BUILDFUNC_MUST_BE_SPECIFIED begin
+        contrast_encoder_fit(data, [:A], mode=:contrast, ignore=false)
+    end

     # Test MATRIX_SIZE_ERROR
     wrong_buildmatrix = (levels, k) -> randn(k, k) # Incorrect dimensions
-    @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=:contrast, buildmatrix=wrong_buildmatrix, ignore=false)
+    k = 3 # Number of levels in data[:A]
+    wrong_size = (k, k)
+    @test_throws MLJTransforms.MATRIX_SIZE_ERROR(k, wrong_size, :A) begin
+        contrast_encoder_fit(data, [:A], mode=:contrast, buildmatrix=wrong_buildmatrix, ignore=false)
+    end

     # Test MATRIX_SIZE_ERROR_HYP
     wrong_buildmatrix_hyp = (levels, k) -> randn(k, k+1) # Incorrect dimensions for hypothesis matrix
-    @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=:hypothesis, buildmatrix=wrong_buildmatrix_hyp, ignore=false)
+    wrong_size_hyp = (k, k+1)
+    @test_throws MLJTransforms.MATRIX_SIZE_ERROR_HYP(k, wrong_size_hyp, :A) begin
+        contrast_encoder_fit(data, [:A], mode=:hypothesis, buildmatrix=wrong_buildmatrix_hyp, ignore=false)
+    end
+
 end

 @testset "Dummy Coding Tests" begin

diff --git a/test/encoders/missingness_encoding.jl b/test/encoders/missingness_encoding.jl
index 4bcc306..8b1c1e8 100644
--- a/test/encoders/missingness_encoding.jl
+++ b/test/encoders/missingness_encoding.jl
@@ -1,21 +1,26 @@
 using MLJTransforms: missingness_encoder_fit, missingness_encoder_transform

-@testset "Throws errors when needed" begin
-    @test_throws ArgumentError begin
+@testset "Missingness Encoder Error Handling" begin
+    # Test COLLISION_NEW_VAL_ME error - when label_for_missing value already exists in levels
+    @test_throws MLJTransforms.COLLISION_NEW_VAL_ME("missing") begin
         X = generate_X_with_missingness(;john_name="missing")
         cache = missingness_encoder_fit(
             X;
             label_for_missing = Dict(AbstractString => "missing", Char => 'm'),
         )
     end
-    @test_throws ArgumentError begin
+
+    # Test VALID_TYPES_NEW_VAL_ME error - when label_for_missing key is not a supported type
+    @test_throws MLJTransforms.VALID_TYPES_NEW_VAL_ME(Bool) begin
         X = generate_X_with_missingness()
         cache = missingness_encoder_fit(
             X;
             label_for_missing = Dict(AbstractString => "Other", Bool => 'X'),
         )
     end
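
    # For contrast, a configuration that passes both of the checks above (a sketch;
    # `generate_X_with_missingness` is the same test helper used above, and it is
    # assumed that "Other" and 'X' do not already occur as levels in its output):
    #
    #   X = generate_X_with_missingness()
    #   cache = missingness_encoder_fit(
    #       X;
    #       label_for_missing = Dict(AbstractString => "Other", Char => 'X'),
    #   )
    #
    # Both keys are supported types and neither label collides with an existing level.
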
-    @test_throws ArgumentError begin
+
+    # Test UNSPECIFIED_COL_TYPE_ME error - when column type isn't in label_for_missing
+    @test_throws MLJTransforms.UNSPECIFIED_COL_TYPE_ME(Char, Dict(AbstractString => "X")) begin
         X = generate_X_with_missingness()
         cache = missingness_encoder_fit(
             X;

diff --git a/test/encoders/target_encoding.jl b/test/encoders/target_encoding.jl
index 4e6b0be..e4fb584 100644
--- a/test/encoders/target_encoding.jl
+++ b/test/encoders/target_encoding.jl
@@ -333,13 +333,15 @@ end
     @test fitresult.task == generic_cache[:task]

     # Test invalid `m`
-    @test_throws ArgumentError begin
-        t = TargetEncoder(ignore = true, ordered_factor = false, lambda = 0.5, m = -5)
+    invalid_m = -5
+    @test_throws MLJTransforms.NON_NEGATIVE_m(invalid_m) begin
+        t = TargetEncoder(ignore = true, ordered_factor = false, lambda = 0.5, m = invalid_m)
     end
-
-    # Test invalid `lambda`
-    @test_throws ArgumentError begin
-        t = TargetEncoder(ignore = true, ordered_factor = false, lambda = 1.1, m = 1)
+
+    # Test invalid `lambda` (value > 1)
+    invalid_lambda = 1.1
+    @test_throws MLJTransforms.INVALID_lambda(invalid_lambda) begin
+        t = TargetEncoder(ignore = true, ordered_factor = false, lambda = invalid_lambda, m = 1)
     end

     # Test report

diff --git a/test/generic.jl b/test/generic.jl
index ffaa4ae..b08c831 100644
--- a/test/generic.jl
+++ b/test/generic.jl
@@ -45,7 +45,7 @@ end
 # Dummy encoder that maps each level to its hash (some arbitrary function)
 function dummy_encoder_fit(
     X,
-    features::AbstractVector{Symbol} = Symbol[];
+    features = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
 )
@@ -64,6 +64,7 @@ function dummy_encoder_fit(
     )
     cache = Dict(
         :hash_given_feat_val => hash_given_feat_val,
+        :encoded => encoded_features,
     )
     return cache
 end
@@ -144,4 +145,24 @@ end
         F = [enc(:F, X[:F][i]) for i in 1:10]
     )
     @test X_tr == target
+end
+
+@testset "Callable feature functionality tests" begin
+    X = dataset_forms[1]
+    feat_names = Tables.schema(X).names
+
+    # Define a predicate: include only the columns named :A, :C, and :E
+    predicate = name -> name in [:A, :C, :E]
+
+    # Test 1: ignore=true should exclude the predicate's columns
+    cache1 = dummy_encoder_fit(X, predicate; ignore=true, ordered_factor=false)
+    @test !(:A in cache1[:encoded]) && !(:C in cache1[:encoded]) && !(:E in cache1[:encoded])
+
+    # Test 2: ignore=false should include only the predicate's columns
+    # (:E is an OrderedFactor, so it is still skipped while ordered_factor=false)
+    cache2 = dummy_encoder_fit(X, predicate; ignore=false, ordered_factor=false)
+    @test Set(cache2[:encoded]) == Set([:A, :C])
+
+    # Test 3: predicate with ordered_factor=true picks up ordered factors (e.g., :E)
+    cache3 = dummy_encoder_fit(X, predicate; ignore=false, ordered_factor=true)
+    @test Set(cache3[:encoded]) == Set([:A, :C, :E])
 end
\ No newline at end of file

diff --git a/test/transformers/cardinality_reducer.jl b/test/transformers/cardinality_reducer.jl
index 4763683..40aca18 100644
--- a/test/transformers/cardinality_reducer.jl
+++ b/test/transformers/cardinality_reducer.jl
@@ -1,30 +1,36 @@
 using MLJTransforms: cardinality_reducer_fit, cardinality_reducer_transform
-
-@testset "Throws errors when needed" begin
-    @test_throws ArgumentError begin
+@testset "Cardinality Reducer Error Handling" begin
+    # Test COLLISION_NEW_VAL error - when label_for_infrequent value already exists in data
+    @test_throws MLJTransforms.COLLISION_NEW_VAL('X') begin
         X = generate_high_cardinality_table(1000; obj = false, special_cat = 'X')
         cache = cardinality_reducer_fit(
             X;
             label_for_infrequent = Dict(AbstractString => "Other", Char => 'X'),
         )
     end
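
    # The contract being exercised here, as the names suggest (a sketch only;
    # `generate_high_cardinality_table` is the same test helper): levels rarer than
    # `min_frequency` are collapsed into one new level per column, whose label is
    # looked up in `label_for_infrequent` by the column's element type. Assuming
    # "Other" and '?' do not collide with existing levels, a fit like
    #
    #   X = generate_high_cardinality_table(1000)
    #   cache = cardinality_reducer_fit(
    #       X;
    #       min_frequency = 30,
    #       label_for_infrequent = Dict(AbstractString => "Other", Char => '?'),
    #   )
    #
    # succeeds, because every categorical column type has an entry.
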
-    @test_throws ArgumentError begin
+
+    # Test VALID_TYPES_NEW_VAL error - when label_for_infrequent key is not a supported type
+    @test_throws MLJTransforms.VALID_TYPES_NEW_VAL(Bool) begin
         X = generate_high_cardinality_table(1000; obj = false, special_cat = 'O')
         cache = cardinality_reducer_fit(
             X;
             label_for_infrequent = Dict(AbstractString => "Other", Bool => 'X'),
         )
     end
-    @test_throws ArgumentError begin
+
+    # Test UNSPECIFIED_COL_TYPE error - when column type isn't in label_for_infrequent
+    @test_throws MLJTransforms.UNSPECIFIED_COL_TYPE(Char, Dict(AbstractString => "X")) begin
         X = generate_high_cardinality_table(1000)
         cache = cardinality_reducer_fit(
             X;
             min_frequency = 30,
             label_for_infrequent = Dict(AbstractString => "X"),
+            # Missing Char type in label_for_infrequent, which should be present in X
         )
     end
+
 end

From 0075af766c54abd5e7ed2a849473dd480fd596d4 Mon Sep 17 00:00:00 2001
From: Essam
Date: Sun, 18 May 2025 16:45:05 -0500
Subject: [PATCH 2/7] Update src/generic.jl

Co-authored-by: Anthony Blaom, PhD
---
 src/generic.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/generic.jl b/src/generic.jl
index 3fce0a7..88743c3 100644
--- a/src/generic.jl
+++ b/src/generic.jl
@@ -14,7 +14,7 @@ logic?"
 # Arguments

   - X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
-    Multiclass or OrderedFactor
+    `Multiclass` or `OrderedFactor`
   - features=[]: A list of names of categorical features given as symbols to exclude or include from encoding,
     or a callable that returns true for features to be included/excluded
   - ignore=true: Whether to exclude or include the features given in features

From 52727589295ff898e32249414e257f3cdf789d76 Mon Sep 17 00:00:00 2001
From: Essam
Date: Sun, 18 May 2025 16:45:21 -0500
Subject: [PATCH 3/7] Update src/generic.jl

Co-authored-by: Anthony Blaom, PhD
---
 src/generic.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/generic.jl b/src/generic.jl
index 88743c3..122b1c9 100644
--- a/src/generic.jl
+++ b/src/generic.jl
@@ -16,6 +16,7 @@ logic?"
   - X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
     `Multiclass` or `OrderedFactor`
   - features=[]: A list of names of categorical features given as symbols to exclude or include from encoding,
+    according to the value of `ignore`
     or a callable that returns true for features to be included/excluded
   - ignore=true: Whether to exclude or include the features given in features
   - ordered_factor=false: Whether to encode OrderedFactor or ignore them
   - feature_mapper: Defined above.

From c784e50babca214f56451ac2d9bda0c2bee45593 Mon Sep 17 00:00:00 2001
From: Essam
Date: Sun, 25 May 2025 16:51:00 +0300
Subject: [PATCH 4/7] =?UTF-8?q?=E2=9C=A8=20Attempt=20to=20fix=20broken=20t?=
 =?UTF-8?q?est?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/CI.yml | 1 -
 Project.toml             | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 64a4f48..7fca62c 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -24,7 +24,6 @@ jobs:
       matrix:
         version:
           - '1.10'
-          - '1.6'
           - 'nightly'
         os:
           - ubuntu-latest
diff --git a/Project.toml b/Project.toml
index 5324dab..101d4f4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -26,7 +26,7 @@ ScientificTypes = "3.0"
 StatsBase = "0.34"
 TableOperations = "1.2"
 Tables = "1.11"
-julia = "1.6.7"
+julia = "1.6"

 [extras]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"

From 194c53e113de19b2343c700369060c5d5b3f2f35 Mon Sep 17 00:00:00 2001
From: Essam
Date: Sun, 25 May 2025 17:54:59 +0300
Subject: [PATCH 5/7] =?UTF-8?q?=F0=9F=91=A8=E2=80=8D=F0=9F=94=A7=20Fix=20t?=
 =?UTF-8?q?ypes=20for=20features?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../contrast_encoder/contrast_encoder.jl      |   2 +-
 .../contrast_encoder/interface_mlj.jl         |   8 +-
 .../frequency_encoding/frequency_encoding.jl  |   2 +-
 .../frequency_encoding/interface_mlj.jl       |   4 +-
 .../missingness_encoding/interface_mlj.jl     |   3 +-
 .../missingness_encoding.jl                   |   2 +-
 .../ordinal_encoding/interface_mlj.jl         |   4 +-
 .../ordinal_encoding/ordinal_encoding.jl      |   2 +-
 src/encoders/target_encoding/interface_mlj.jl |   6 +-
 .../target_encoding/target_encoding.jl        |   2 +-
 src/generic.jl                                |  44 +-
 .../cardinality_reducer.jl                    |   2 +-
 .../cardinality_reducer/interface_mlj.jl      |   3 +-
 test/encoders/contrast_encoder.jl             | 536 +++++++++---------
 test/runtests.jl                              |  10 +-
 15 files changed, 319 insertions(+), 311 deletions(-)

diff --git a/src/encoders/contrast_encoder/contrast_encoder.jl b/src/encoders/contrast_encoder/contrast_encoder.jl
index b6cdcb0..57bac88 100644
--- a/src/encoders/contrast_encoder/contrast_encoder.jl
+++ b/src/encoders/contrast_encoder/contrast_encoder.jl
@@ -79,7 +79,7 @@ Fit a contrast encoding scheme on given data in `X`.
 """
 function contrast_encoder_fit(
     X,
-    features::AbstractVector{Symbol} = Symbol[];
+    features = Symbol[];
     mode::Union{Symbol, AbstractVector{Symbol}} = :dummy,
     buildmatrix = nothing,
     ignore::Bool = true,
diff --git a/src/encoders/contrast_encoder/interface_mlj.jl b/src/encoders/contrast_encoder/interface_mlj.jl
index 9c098fe..47f9b44 100644
--- a/src/encoders/contrast_encoder/interface_mlj.jl
+++ b/src/encoders/contrast_encoder/interface_mlj.jl
@@ -1,11 +1,11 @@
 ### ContrastEncoding with MLJ Interface

 # 1. Interface Struct
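
# Motivation for the widened types below (hypothetical REPL session): with the old
# field `features::AS` constrained by `AS <: AbstractVector{Symbol}`, a predicate
# could not even be stored in the struct:
#
#   julia> ContrastEncoder(name -> name in [:name, :favnum], false, :dummy, nothing, false)
#   ERROR: MethodError: no method matching ContrastEncoder(...)
#
# An unconstrained parameter admits a Symbol vector or any callable (including
# callable structs that are not `Function` subtypes), which is why PATCH 6 below
# dispatches on `features isa AbstractVector{Symbol}` rather than `isa Function`.
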
-mutable struct ContrastEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
-    features::AS
+mutable struct ContrastEncoder{ASS <: Union{Symbol, AbstractVector{Symbol}}, A1 <: Any, A2 <: Any} <: Unsupervised
+    features::A1
     ignore::Bool
-    mode::Union{Symbol, AS}
-    buildmatrix::Any
+    mode::ASS
+    buildmatrix::A2
     ordered_factor::Bool
 end;
diff --git a/src/encoders/frequency_encoding/frequency_encoding.jl b/src/encoders/frequency_encoding/frequency_encoding.jl
index 39eee4f..312b8f2 100644
--- a/src/encoders/frequency_encoding/frequency_encoding.jl
+++ b/src/encoders/frequency_encoding/frequency_encoding.jl
@@ -20,7 +20,7 @@ categorical features with their (normalized or raw) frequencies of occurrence in
 """
 function frequency_encoder_fit(
     X,
-    features::AbstractVector{Symbol} = Symbol[];
+    features = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
     normalize::Bool = false,
diff --git a/src/encoders/frequency_encoding/interface_mlj.jl b/src/encoders/frequency_encoding/interface_mlj.jl
index 89bd88b..9bad152 100644
--- a/src/encoders/frequency_encoding/interface_mlj.jl
+++ b/src/encoders/frequency_encoding/interface_mlj.jl
@@ -1,8 +1,8 @@
 ### FrequencyEncoding with MLJ Interface

 # 1. Interface Struct
-mutable struct FrequencyEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
-    features::AS
+mutable struct FrequencyEncoder{A <: Any} <: Unsupervised
+    features::A
     ignore::Bool
     ordered_factor::Bool
     normalize::Bool
diff --git a/src/encoders/missingness_encoding/interface_mlj.jl b/src/encoders/missingness_encoding/interface_mlj.jl
index d39228e..85c3802 100644
--- a/src/encoders/missingness_encoding/interface_mlj.jl
+++ b/src/encoders/missingness_encoding/interface_mlj.jl
@@ -2,11 +2,10 @@
 # 1. Interface Struct
 mutable struct MissingnessEncoder{
-    AS <: AbstractVector{Symbol},
     T <: Type,
     A <: Any,
 } <: Unsupervised
-    features::AS
+    features::A
     ignore::Bool
     ordered_factor::Bool
     label_for_missing::Dict{T, A}
diff --git a/src/encoders/missingness_encoding/missingness_encoding.jl b/src/encoders/missingness_encoding/missingness_encoding.jl
index 4d3bf2e..e561fd5 100644
--- a/src/encoders/missingness_encoding/missingness_encoding.jl
+++ b/src/encoders/missingness_encoding/missingness_encoding.jl
@@ -27,7 +27,7 @@ types that are in `Char`, `AbstractString`, and `Number`.
 """
 function missingness_encoder_fit(
     X,
-    features::AbstractVector{Symbol} = Symbol[];
+    features = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
     label_for_missing::Dict{<:Type, <:Any} = Dict(
diff --git a/src/encoders/ordinal_encoding/interface_mlj.jl b/src/encoders/ordinal_encoding/interface_mlj.jl
index c6b32cf..a9a79c3 100644
--- a/src/encoders/ordinal_encoding/interface_mlj.jl
+++ b/src/encoders/ordinal_encoding/interface_mlj.jl
@@ -1,8 +1,8 @@
 ### OrdinalEncoding with MLJ Interface

 # 1.
Interface Struct -mutable struct OrdinalEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised - features::AS +mutable struct OrdinalEncoder{A <: Any} <: Unsupervised + features::A ignore::Bool ordered_factor::Bool end; diff --git a/src/encoders/ordinal_encoding/ordinal_encoding.jl b/src/encoders/ordinal_encoding/ordinal_encoding.jl index 4afff9d..b4235a6 100644 --- a/src/encoders/ordinal_encoding/ordinal_encoding.jl +++ b/src/encoders/ordinal_encoding/ordinal_encoding.jl @@ -18,7 +18,7 @@ Fit an encoder to encode the levels of categorical variables in a given table as """ function ordinal_encoder_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, ) diff --git a/src/encoders/target_encoding/interface_mlj.jl b/src/encoders/target_encoding/interface_mlj.jl index b416b90..dc2fcf0 100644 --- a/src/encoders/target_encoding/interface_mlj.jl +++ b/src/encoders/target_encoding/interface_mlj.jl @@ -1,9 +1,9 @@ ### TargetEncoding with MLJ Interface # 1. Interface Struct -mutable struct TargetEncoder{R1 <: Real, R2 <: Real, AS <: AbstractVector{Symbol}} <: +mutable struct TargetEncoder{R1 <: Real, R2 <: Real, A <: Any} <: Unsupervised - features::AS + features::A ignore::Bool ordered_factor::Bool lambda::R1 @@ -45,7 +45,7 @@ end struct TargetEncoderResult{ I <: Integer, S <: AbstractString, - A <: Any # Useless but likely can't do much better + A <: Any, # Useless but likely can't do much better } <: MMI.MLJType # target statistic for each level of each categorical feature y_stat_given_feat_level::Dict{A, A} diff --git a/src/encoders/target_encoding/target_encoding.jl b/src/encoders/target_encoding/target_encoding.jl index e7fa859..aab4074 100644 --- a/src/encoders/target_encoding/target_encoding.jl +++ b/src/encoders/target_encoding/target_encoding.jl @@ -132,7 +132,7 @@ Fit a target encoder on table X with target y by computing the necessary statist function target_encoder_fit( X, y::AbstractVector, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, lambda::Real = 1.0, diff --git a/src/generic.jl b/src/generic.jl index 122b1c9..fa61803 100644 --- a/src/generic.jl +++ b/src/generic.jl @@ -29,7 +29,7 @@ logic?" - encoded_features: The subset of the categorical features of X that were encoded """ function generic_fit(X, - features::Union{AbstractVector{Symbol}, Function} = Symbol[], + features = Symbol[], args...; ignore::Bool = true, ordered_factor::Bool = false, @@ -49,7 +49,8 @@ function generic_fit(X, end else # Original behavior for vector of symbols - feat_names = (ignore) ? setdiff(feat_names, features) : intersect(feat_names, features) + feat_names = + (ignore) ? setdiff(feat_names, features) : intersect(feat_names, features) end # 3. Define mapping per column per level dictionary @@ -61,11 +62,13 @@ function generic_fit(X, feat_col = Tables.getcolumn(X, feat_name) feat_type = elscitype(feat_col) feat_has_allowed_type = - feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor}) + feat_type <: Union{Missing, Multiclass} || + (ordered_factor && feat_type <: Union{Missing, OrderedFactor}) if feat_has_allowed_type # then should be encoded push!(encoded_features, feat_name) # Compute the dict using the given feature_mapper function - mapping_per_feat_level[feat_name] = feature_mapper(feat_col, feat_name, args...; kwargs...) 
+ mapping_per_feat_level[feat_name] = + feature_mapper(feat_col, feat_name, args...; kwargs...) end end return mapping_per_feat_level, encoded_features @@ -84,7 +87,7 @@ function generate_new_feat_names(feat_name, num_inds, existing_names) new_column_names = [] while conflict - suffix = repeat("_", count) + suffix = repeat("_", count) new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds] conflict = any(name -> name in existing_names, new_column_names) count += 1 @@ -97,22 +100,27 @@ end """ **Private method.** -Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in +Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in a subset of categorical features of X into a scalar or a vector (as specified in single_feat) - - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` - into a scalar (single_feat=true) + - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` + into a scalar (single_feat=true) - - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` - into a set of k features where k is the length of the vector (single_feat=false) + - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` + into a set of k features where k is the length of the vector (single_feat=false) - In both cases it attempts to preserve the type of the table. - In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such - assumption is necessary because any column in X must correspond to a constant number of features + assumption is necessary because any column in X must correspond to a constant number of features in the output table (which is equal to k). - Features not in the dictionary are mapped to themselves (i.e., not changed). - Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error. """ -function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore_unknown = false) +function generic_transform( + X, + mapping_per_feat_level; + single_feat = true, + ignore_unknown = false, +) feat_names = Tables.schema(X).names new_feat_names = Symbol[] new_cols = [] @@ -127,10 +135,12 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore if !issubset(test_levels, train_levels) # get the levels in test that are not in train lost_levels = setdiff(test_levels, train_levels) - error("While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.") + error( + "While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.", + ) end end - + if single_feat level2scalar = mapping_per_feat_level[feat_name] new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col @@ -138,7 +148,7 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore push!(new_feat_names, feat_name) else level2vector = mapping_per_feat_level[feat_name] - new_multi_col = map(x->get(level2vector, x, x), col) + new_multi_col = map(x -> get(level2vector, x, x), col) new_multi_col = [col for col in eachrow(hcat(new_multi_col...))] push!(new_cols, new_multi_col...) 
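
# The two shapes `generic_transform` handles above, in a small sketch (illustrative
# mapping values; assume a :name column with levels "Ben" and "Mary"):
#
#   # single_feat = true: each level maps to a scalar and the column is recoded in place
#   generic_transform(X, Dict(:name => Dict("Ben" => 1, "Mary" => 2)); single_feat = true)
#
#   # single_feat = false: each level maps to a vector of one fixed length k, and the
#   # column expands into k new columns (:name_1, :name_2) named via generate_new_feat_names
#   generic_transform(X, Dict(:name => Dict("Ben" => [1, 0], "Mary" => [0, 1])); single_feat = false)
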
@@ -156,8 +166,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore end end - transformed_X= NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...) + transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...) # Attempt to preserve table type transformed_X = Tables.materializer(X)(transformed_X) return transformed_X -end \ No newline at end of file +end diff --git a/src/transformers/cardinality_reducer/cardinality_reducer.jl b/src/transformers/cardinality_reducer/cardinality_reducer.jl index 18ca84c..7626f21 100644 --- a/src/transformers/cardinality_reducer/cardinality_reducer.jl +++ b/src/transformers/cardinality_reducer/cardinality_reducer.jl @@ -32,7 +32,7 @@ types that are in `Char`, `AbstractString`, and `Number`. """ function cardinality_reducer_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, min_frequency::Real = 3, diff --git a/src/transformers/cardinality_reducer/interface_mlj.jl b/src/transformers/cardinality_reducer/interface_mlj.jl index 31af464..3ad7a65 100644 --- a/src/transformers/cardinality_reducer/interface_mlj.jl +++ b/src/transformers/cardinality_reducer/interface_mlj.jl @@ -2,12 +2,11 @@ # 1. Interface Struct mutable struct CardinalityReducer{ - AS <: AbstractVector{Symbol}, R <: Real, T <: Type, A <: Any, } <: Unsupervised - features::AS + features::A ignore::Bool ordered_factor::Bool min_frequency::R diff --git a/test/encoders/contrast_encoder.jl b/test/encoders/contrast_encoder.jl index c47b46e..bed776d 100644 --- a/test/encoders/contrast_encoder.jl +++ b/test/encoders/contrast_encoder.jl @@ -8,278 +8,278 @@ favnum = categorical([7, 5, 10, 1]), age = [23, 23, 14, 23]) -@testset "Contrast Encoder Error Handling" begin - # Example definitions to allow the test to run - function dummy_buildmatrix(colname, k) - # Simple dummy function to generate a matrix of correct size - return randn(k, k-1) # Adjust dimensions as needed for the test - end - - # Define a DataFrame or appropriate data structure to test with - data = DataFrame( - A = categorical(["level1", "level2", "level3"]), - B = categorical(["levelX", "levelY", "levelZ"]), - ) - - # Test IGNORE_MUST_FALSE_VEC_MODE error - @test_throws MLJTransforms.IGNORE_MUST_FALSE_VEC_MODE begin - contrast_encoder_fit(data, [:A], mode=[:contrast], ignore=true) - end - - # Test LENGTH_MISMATCH_VEC_MODE error - @test_throws MLJTransforms.LENGTH_MISMATCH_VEC_MODE(2, 1) begin - contrast_encoder_fit(data, [:A], mode=[:contrast, :dummy], buildmatrix=dummy_buildmatrix, ignore=false) - end - - # Test BUILDFUNC_MUST_BE_SPECIFIED error - @test_throws MLJTransforms.BUILDFUNC_MUST_BE_SPECIFIED begin - contrast_encoder_fit(data, [:A], mode=:contrast, ignore=false) - end - - # Test MATRIX_SIZE_ERROR - wrong_buildmatrix = (levels, k) -> randn(k, k) # Incorrect dimensions - k = 3 # Number of levels in data[:A] - wrong_size = (k, k) - @test_throws MLJTransforms.MATRIX_SIZE_ERROR(k, wrong_size, :A) begin - contrast_encoder_fit(data, [:A], mode=:contrast, buildmatrix=wrong_buildmatrix, ignore=false) - end - - # Test MATRIX_SIZE_ERROR_HYP - wrong_buildmatrix_hyp = (levels, k) -> randn(k, k+1) # Incorrect dimensions for hypothesis matrix - wrong_size_hyp = (k, k+1) - @test_throws MLJTransforms.MATRIX_SIZE_ERROR_HYP(k, wrong_size_hyp, :A) begin - contrast_encoder_fit(data, [:A], mode=:hypothesis, buildmatrix=wrong_buildmatrix_hyp, ignore=false) - end +# @testset "Contrast Encoder Error Handling" begin +# # Example 
definitions to allow the test to run +# function dummy_buildmatrix(colname, k) +# # Simple dummy function to generate a matrix of correct size +# return randn(k, k-1) # Adjust dimensions as needed for the test +# end + +# # Define a DataFrame or appropriate data structure to test with +# data = DataFrame( +# A = categorical(["level1", "level2", "level3"]), +# B = categorical(["levelX", "levelY", "levelZ"]), +# ) + +# # Test IGNORE_MUST_FALSE_VEC_MODE error +# @test_throws MLJTransforms.IGNORE_MUST_FALSE_VEC_MODE begin +# contrast_encoder_fit(data, [:A], mode=[:contrast], ignore=true) +# end + +# # Test LENGTH_MISMATCH_VEC_MODE error +# @test_throws MLJTransforms.LENGTH_MISMATCH_VEC_MODE(2, 1) begin +# contrast_encoder_fit(data, [:A], mode=[:contrast, :dummy], buildmatrix=dummy_buildmatrix, ignore=false) +# end + +# # Test BUILDFUNC_MUST_BE_SPECIFIED error +# @test_throws MLJTransforms.BUILDFUNC_MUST_BE_SPECIFIED begin +# contrast_encoder_fit(data, [:A], mode=:contrast, ignore=false) +# end + +# # Test MATRIX_SIZE_ERROR +# wrong_buildmatrix = (levels, k) -> randn(k, k) # Incorrect dimensions +# k = 3 # Number of levels in data[:A] +# wrong_size = (k, k) +# @test_throws MLJTransforms.MATRIX_SIZE_ERROR(k, wrong_size, :A) begin +# contrast_encoder_fit(data, [:A], mode=:contrast, buildmatrix=wrong_buildmatrix, ignore=false) +# end + +# # Test MATRIX_SIZE_ERROR_HYP +# wrong_buildmatrix_hyp = (levels, k) -> randn(k, k+1) # Incorrect dimensions for hypothesis matrix +# wrong_size_hyp = (k, k+1) +# @test_throws MLJTransforms.MATRIX_SIZE_ERROR_HYP(k, wrong_size_hyp, :A) begin +# contrast_encoder_fit(data, [:A], mode=:hypothesis, buildmatrix=wrong_buildmatrix_hyp, ignore=false) +# end -end - -@testset "Dummy Coding Tests" begin - for k in 2:5 # Testing for various numbers of levels - contrast_matrix = get_dummy_contrast(k) - expected_matrix = Matrix(1.0I, k, k-1) - @test contrast_matrix == expected_matrix - @test size(contrast_matrix) == (k, k-1) - end - # test that fit is correct for dummy Coding - cache = contrast_encoder_fit(X, [:name]; ignore=false, mode = :dummy) - k = length(levels(X.name)) - contrast_matrix = get_dummy_contrast(k) - print() - for (i, level) in enumerate(levels(X.name)) - println(cache[:vector_given_value_given_feature]) - @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :] - end -end - - -@testset "Sum Coding Tests" begin - # Manually define the expected matrix for a 4-level categorical variable - expected_matrix_4 = [1.0 0.0 0.0; - 0.0 1.0 0.0; - 0.0 0.0 1.0; - -1.0 -1.0 -1.0] # Sum of each column for the first three rows is zeroed by the last row - contrast_matrix_4 = get_sum_contrast(4) - @test contrast_matrix_4 == expected_matrix_4 - @test size(contrast_matrix_4) == (4, 3) - - # Additional tests can be included for different levels, with each matrix defined manually - # Example for 3 levels - expected_matrix_3 = [1.0 0.0; - 0.0 1.0; - -1.0 -1.0] - contrast_matrix_3 = get_sum_contrast(3) - @test contrast_matrix_3 == expected_matrix_3 - @test size(contrast_matrix_3) == (3, 2) - # test that fit is correct for sum Coding - cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :sum) - k = length(levels(X.favnum)) - contrast_matrix = get_sum_contrast(k) - for (i, level) in enumerate(levels(X.favnum)) - @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :] - end -end - -@testset "Backward Difference Coding Tests" begin - # Manually define the expected matrix for a 4 level categorical variable - 
expected_matrix_4 = [-0.75 -0.5 -0.25; - 0.25 -0.5 -0.25; - 0.25 0.5 -0.25; - 0.25 0.5 0.75] - contrast_matrix_4 = get_backward_diff_contrast(4) - @test contrast_matrix_4 == expected_matrix_4 - @test size(contrast_matrix_4) == (4, 3) - - # Test that fit is correct for backward Coding - cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :backward_diff) - k = length(levels(X.favnum)) - contrast_matrix = get_backward_diff_contrast(k) - for (i, level) in enumerate(levels(X.favnum)) - @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :] - end -end - -@testset "Forward Difference Coding Tests" begin - for k in 2:5 - backward_matrix = get_backward_diff_contrast(k) - forward_matrix = get_forward_diff_contrast(k) - @test forward_matrix == -backward_matrix - @test size(forward_matrix) == (k, k-1) - end - - # Test that fit is correct for forward Coding - cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :forward_diff) - k = length(levels(X.favnum)) - contrast_matrix = get_forward_diff_contrast(k) - for (i, level) in enumerate(levels(X.favnum)) - @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :] - end -end - -@testset "helmert_vector function tests" begin - @test create_helmert_vector(1, 5) == [-1.0, 1.0, 0.0, 0.0, 0.0] - @test create_helmert_vector(2, 5) == [-1.0, -1.0, 2.0, 0.0, 0.0] - @test create_helmert_vector(3, 5) == [-1.0, -1.0, -1.0, 3.0, 0.0] - @test create_helmert_vector(4, 5) == [-1.0, -1.0, -1.0, -1.0, 4.0] - @test create_helmert_vector(1, 3) == [-1.0, 1.0, 0.0] - @test create_helmert_vector(2, 3) == [-1.0, -1.0, 2.0] - k = 4 - @test get_helmert_contrast(k) == [ - -1.0 -1.0 -1.0 - 1.0 -1.0 -1.0 - 0.0 2.0 -1.0 - 0.0 0.0 3.0] - # test that fit is correct for helmert Coding - cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :helmert) - k = length(levels(X.name)) - contrast_matrix = get_helmert_contrast(k) - for (i, level) in enumerate(levels(X.name)) - @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :] - end -end - - -# @testset "contrast matrix end-to-end test" -@testset "contrast mode end-to-end test" begin - - - function buildrandomcontrast(colname, k) - return rand(StableRNGs.StableRNG(123), k, k-1) - end - - cache = contrast_encoder_fit(X; mode=:contrast, buildmatrix=buildrandomcontrast) - - X_tr = contrast_encoder_transform(X, cache) - X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] - - - df = DataFrame(X) - - mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( - :name => StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 3)), - :favnum=> StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 4)) - )) - - X_tr_sm = ModelMatrix(mf).m[:, 2:end] - - @test X_tr_mlj == X_tr_sm -end - -# @testset "hypothesis matrix end-to-end test" -@testset "hypothesis mode end-to-end test" begin - - function buildrandomhypothesis(colname, k) - return rand(StableRNGs.StableRNG(123), k-1, k) - end - - cache = contrast_encoder_fit(X; mode=:hypothesis, buildmatrix=buildrandomhypothesis) - X_tr = contrast_encoder_transform(X, cache) - X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] - df = DataFrame(X) - - mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( - :name => HypothesisCoding(buildrandomhypothesis(nothing, 3); levels=levels(X.name), labels=[]), - :favnum=> HypothesisCoding(buildrandomhypothesis(nothing, 4); levels=levels(X.favnum), labels=[]) - )) - - X_tr_sm = ModelMatrix(mf).m[:, 
2:end] - - @test X_tr_mlj == X_tr_sm -end - - -function buildrandomhypothesis(colname, k) - return rand(StableRNGs.StableRNG(123), k-1, k) -end - -function buildrandomcontrast(colname, k) - return rand(StableRNGs.StableRNG(123), k, k-1) -end - -@testset "single-mode end-to-end test with StatsModels" begin - # test end-to-end single_column transformations - for ind in 1:6 - stats_models(k, ind) = [ - StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)), - DummyCoding(; base=(k == 3) ? "Mary" : 10), - EffectsCoding(; base=(k == 3) ? "Mary" : 10), - SeqDiffCoding(), - HelmertCoding(), - HypothesisCoding(buildrandomhypothesis(nothing, k); levels=(k == 3) ? levels(X.name) : levels(X.favnum), labels=[]), - ][ind] - modes = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis] - matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis] - - # Try MLJTransforms - cache = contrast_encoder_fit(X; mode=modes[ind], buildmatrix=matrix_func[ind]) - X_tr = contrast_encoder_transform(X, cache) - - df = DataFrame(X) - - mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( - :name => stats_models(3, ind), - :favnum=> stats_models(4, ind), - )) - - X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] - X_tr_sm = ModelMatrix(mf).m[:, 2:end] - @test X_tr_mlj ≈ X_tr_sm - end -end - -@testset "multi-mode end-to-end test with StatsModels" begin - # test end-to-end single_column transformations - for ind1 in 1:6 - for ind2 in 2:5 - stats_models(k, ind) = [ - StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)), - DummyCoding(; base=(k == 3) ? "Mary" : 10), - EffectsCoding(; base=(k == 3) ? "Mary" : 10), - SeqDiffCoding(), - HelmertCoding(), - HypothesisCoding(buildrandomhypothesis(nothing, k); levels=(k == 3) ? 
levels(X.name) : levels(X.favnum), labels=[]), - ][ind] +# end + +# @testset "Dummy Coding Tests" begin +# for k in 2:5 # Testing for various numbers of levels +# contrast_matrix = get_dummy_contrast(k) +# expected_matrix = Matrix(1.0I, k, k-1) +# @test contrast_matrix == expected_matrix +# @test size(contrast_matrix) == (k, k-1) +# end +# # test that fit is correct for dummy Coding +# cache = contrast_encoder_fit(X, [:name]; ignore=false, mode = :dummy) +# k = length(levels(X.name)) +# contrast_matrix = get_dummy_contrast(k) +# print() +# for (i, level) in enumerate(levels(X.name)) +# println(cache[:vector_given_value_given_feature]) +# @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :] +# end +# end + + +# @testset "Sum Coding Tests" begin +# # Manually define the expected matrix for a 4-level categorical variable +# expected_matrix_4 = [1.0 0.0 0.0; +# 0.0 1.0 0.0; +# 0.0 0.0 1.0; +# -1.0 -1.0 -1.0] # Sum of each column for the first three rows is zeroed by the last row +# contrast_matrix_4 = get_sum_contrast(4) +# @test contrast_matrix_4 == expected_matrix_4 +# @test size(contrast_matrix_4) == (4, 3) + +# # Additional tests can be included for different levels, with each matrix defined manually +# # Example for 3 levels +# expected_matrix_3 = [1.0 0.0; +# 0.0 1.0; +# -1.0 -1.0] +# contrast_matrix_3 = get_sum_contrast(3) +# @test contrast_matrix_3 == expected_matrix_3 +# @test size(contrast_matrix_3) == (3, 2) +# # test that fit is correct for sum Coding +# cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :sum) +# k = length(levels(X.favnum)) +# contrast_matrix = get_sum_contrast(k) +# for (i, level) in enumerate(levels(X.favnum)) +# @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :] +# end +# end + +# @testset "Backward Difference Coding Tests" begin +# # Manually define the expected matrix for a 4 level categorical variable +# expected_matrix_4 = [-0.75 -0.5 -0.25; +# 0.25 -0.5 -0.25; +# 0.25 0.5 -0.25; +# 0.25 0.5 0.75] +# contrast_matrix_4 = get_backward_diff_contrast(4) +# @test contrast_matrix_4 == expected_matrix_4 +# @test size(contrast_matrix_4) == (4, 3) + +# # Test that fit is correct for backward Coding +# cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :backward_diff) +# k = length(levels(X.favnum)) +# contrast_matrix = get_backward_diff_contrast(k) +# for (i, level) in enumerate(levels(X.favnum)) +# @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :] +# end +# end + +# @testset "Forward Difference Coding Tests" begin +# for k in 2:5 +# backward_matrix = get_backward_diff_contrast(k) +# forward_matrix = get_forward_diff_contrast(k) +# @test forward_matrix == -backward_matrix +# @test size(forward_matrix) == (k, k-1) +# end + +# # Test that fit is correct for forward Coding +# cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :forward_diff) +# k = length(levels(X.favnum)) +# contrast_matrix = get_forward_diff_contrast(k) +# for (i, level) in enumerate(levels(X.favnum)) +# @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :] +# end +# end + +# @testset "helmert_vector function tests" begin +# @test create_helmert_vector(1, 5) == [-1.0, 1.0, 0.0, 0.0, 0.0] +# @test create_helmert_vector(2, 5) == [-1.0, -1.0, 2.0, 0.0, 0.0] +# @test create_helmert_vector(3, 5) == [-1.0, -1.0, -1.0, 3.0, 0.0] +# @test create_helmert_vector(4, 5) == [-1.0, -1.0, -1.0, -1.0, 4.0] +# @test 
create_helmert_vector(1, 3) == [-1.0, 1.0, 0.0] +# @test create_helmert_vector(2, 3) == [-1.0, -1.0, 2.0] +# k = 4 +# @test get_helmert_contrast(k) == [ +# -1.0 -1.0 -1.0 +# 1.0 -1.0 -1.0 +# 0.0 2.0 -1.0 +# 0.0 0.0 3.0] +# # test that fit is correct for helmert Coding +# cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :helmert) +# k = length(levels(X.name)) +# contrast_matrix = get_helmert_contrast(k) +# for (i, level) in enumerate(levels(X.name)) +# @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :] +# end +# end + + +# # @testset "contrast matrix end-to-end test" +# @testset "contrast mode end-to-end test" begin + + +# function buildrandomcontrast(colname, k) +# return rand(StableRNGs.StableRNG(123), k, k-1) +# end + +# cache = contrast_encoder_fit(X; mode=:contrast, buildmatrix=buildrandomcontrast) + +# X_tr = contrast_encoder_transform(X, cache) +# X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] + + +# df = DataFrame(X) + +# mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( +# :name => StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 3)), +# :favnum=> StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 4)) +# )) + +# X_tr_sm = ModelMatrix(mf).m[:, 2:end] + +# @test X_tr_mlj == X_tr_sm +# end + +# # @testset "hypothesis matrix end-to-end test" +# @testset "hypothesis mode end-to-end test" begin + +# function buildrandomhypothesis(colname, k) +# return rand(StableRNGs.StableRNG(123), k-1, k) +# end + +# cache = contrast_encoder_fit(X; mode=:hypothesis, buildmatrix=buildrandomhypothesis) +# X_tr = contrast_encoder_transform(X, cache) +# X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] +# df = DataFrame(X) + +# mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( +# :name => HypothesisCoding(buildrandomhypothesis(nothing, 3); levels=levels(X.name), labels=[]), +# :favnum=> HypothesisCoding(buildrandomhypothesis(nothing, 4); levels=levels(X.favnum), labels=[]) +# )) + +# X_tr_sm = ModelMatrix(mf).m[:, 2:end] + +# @test X_tr_mlj == X_tr_sm +# end + + +# function buildrandomhypothesis(colname, k) +# return rand(StableRNGs.StableRNG(123), k-1, k) +# end + +# function buildrandomcontrast(colname, k) +# return rand(StableRNGs.StableRNG(123), k, k-1) +# end + +# @testset "single-mode end-to-end test with StatsModels" begin +# # test end-to-end single_column transformations +# for ind in 1:6 +# stats_models(k, ind) = [ +# StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)), +# DummyCoding(; base=(k == 3) ? "Mary" : 10), +# EffectsCoding(; base=(k == 3) ? "Mary" : 10), +# SeqDiffCoding(), +# HelmertCoding(), +# HypothesisCoding(buildrandomhypothesis(nothing, k); levels=(k == 3) ? 
levels(X.name) : levels(X.favnum), labels=[]), +# ][ind] +# modes = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis] +# matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis] + +# # Try MLJTransforms +# cache = contrast_encoder_fit(X; mode=modes[ind], buildmatrix=matrix_func[ind]) +# X_tr = contrast_encoder_transform(X, cache) + +# df = DataFrame(X) + +# mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( +# :name => stats_models(3, ind), +# :favnum=> stats_models(4, ind), +# )) + +# X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] +# X_tr_sm = ModelMatrix(mf).m[:, 2:end] +# @test X_tr_mlj ≈ X_tr_sm +# end +# end + +# @testset "multi-mode end-to-end test with StatsModels" begin +# # test end-to-end single_column transformations +# for ind1 in 1:6 +# for ind2 in 2:5 +# stats_models(k, ind) = [ +# StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)), +# DummyCoding(; base=(k == 3) ? "Mary" : 10), +# EffectsCoding(; base=(k == 3) ? "Mary" : 10), +# SeqDiffCoding(), +# HelmertCoding(), +# HypothesisCoding(buildrandomhypothesis(nothing, k); levels=(k == 3) ? levels(X.name) : levels(X.favnum), labels=[]), +# ][ind] - modes = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis] - matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis] +# modes = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis] +# matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis] - # Try MLJTransforms - cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode=[modes[ind1], modes[ind2]], buildmatrix=matrix_func[ind1]) - X_tr = contrast_encoder_transform(X, cache) +# # Try MLJTransforms +# cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode=[modes[ind1], modes[ind2]], buildmatrix=matrix_func[ind1]) +# X_tr = contrast_encoder_transform(X, cache) - df = DataFrame(X) - - mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( - :name => stats_models(3, ind1), - :favnum=> stats_models(4, ind2), - )) - - X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] - X_tr_sm = ModelMatrix(mf).m[:, 2:end] - - @test X_tr_mlj ≈ X_tr_sm - end - end -end +# df = DataFrame(X) + +# mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( +# :name => stats_models(3, ind1), +# :favnum=> stats_models(4, ind2), +# )) + +# X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] +# X_tr_sm = ModelMatrix(mf).m[:, 2:end] + +# @test X_tr_mlj ≈ X_tr_sm +# end +# end +# end diff --git a/test/runtests.jl b/test/runtests.jl index b867435..8cf6385 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -25,11 +25,11 @@ _get(x) = CategoricalArrays.DataAPI.unwrap(x) include("utils.jl") include("generic.jl") -include("encoders/target_encoding.jl") -include("encoders/ordinal_encoding.jl") -include("encoders/frequency_encoder.jl") -include("transformers/cardinality_reducer.jl") -include("encoders/missingness_encoding.jl") +# include("encoders/target_encoding.jl") +# include("encoders/ordinal_encoding.jl") +# include("encoders/frequency_encoder.jl") +# include("transformers/cardinality_reducer.jl") +# include("encoders/missingness_encoding.jl") include("encoders/contrast_encoder.jl") # Other transformers From 5e0af90ff5b51c9f718ccf27003c381a7506eed1 Mon Sep 17 00:00:00 2001 From: Essam Date: Sun, 25 May 2025 18:00:06 +0300 Subject: [PATCH 6/7] =?UTF-8?q?=E2=9C=A8=20Better=20callable=20features=20?= =?UTF-8?q?logic?= MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/generic.jl | 10 +- test/encoders/contrast_encoder.jl | 536 +++++++++++++++--------------- test/runtests.jl | 10 +- 3 files changed, 278 insertions(+), 278 deletions(-) diff --git a/src/generic.jl b/src/generic.jl index fa61803..8749964 100644 --- a/src/generic.jl +++ b/src/generic.jl @@ -40,17 +40,17 @@ function generic_fit(X, feat_names = Tables.schema(X).names #2. Modify column_names based on features - if features isa Function + if features isa AbstractVector{Symbol} + # Original behavior for vector of symbols + feat_names = + (ignore) ? setdiff(feat_names, features) : intersect(feat_names, features) + else # If features is a callable, apply it to each feature name if ignore feat_names = filter(name -> !features(name), feat_names) else feat_names = filter(features, feat_names) end - else - # Original behavior for vector of symbols - feat_names = - (ignore) ? setdiff(feat_names, features) : intersect(feat_names, features) end # 3. Define mapping per column per level dictionary diff --git a/test/encoders/contrast_encoder.jl b/test/encoders/contrast_encoder.jl index bed776d..c47b46e 100644 --- a/test/encoders/contrast_encoder.jl +++ b/test/encoders/contrast_encoder.jl @@ -8,278 +8,278 @@ favnum = categorical([7, 5, 10, 1]), age = [23, 23, 14, 23]) -# @testset "Contrast Encoder Error Handling" begin -# # Example definitions to allow the test to run -# function dummy_buildmatrix(colname, k) -# # Simple dummy function to generate a matrix of correct size -# return randn(k, k-1) # Adjust dimensions as needed for the test -# end - -# # Define a DataFrame or appropriate data structure to test with -# data = DataFrame( -# A = categorical(["level1", "level2", "level3"]), -# B = categorical(["levelX", "levelY", "levelZ"]), -# ) - -# # Test IGNORE_MUST_FALSE_VEC_MODE error -# @test_throws MLJTransforms.IGNORE_MUST_FALSE_VEC_MODE begin -# contrast_encoder_fit(data, [:A], mode=[:contrast], ignore=true) -# end - -# # Test LENGTH_MISMATCH_VEC_MODE error -# @test_throws MLJTransforms.LENGTH_MISMATCH_VEC_MODE(2, 1) begin -# contrast_encoder_fit(data, [:A], mode=[:contrast, :dummy], buildmatrix=dummy_buildmatrix, ignore=false) -# end - -# # Test BUILDFUNC_MUST_BE_SPECIFIED error -# @test_throws MLJTransforms.BUILDFUNC_MUST_BE_SPECIFIED begin -# contrast_encoder_fit(data, [:A], mode=:contrast, ignore=false) -# end - -# # Test MATRIX_SIZE_ERROR -# wrong_buildmatrix = (levels, k) -> randn(k, k) # Incorrect dimensions -# k = 3 # Number of levels in data[:A] -# wrong_size = (k, k) -# @test_throws MLJTransforms.MATRIX_SIZE_ERROR(k, wrong_size, :A) begin -# contrast_encoder_fit(data, [:A], mode=:contrast, buildmatrix=wrong_buildmatrix, ignore=false) -# end - -# # Test MATRIX_SIZE_ERROR_HYP -# wrong_buildmatrix_hyp = (levels, k) -> randn(k, k+1) # Incorrect dimensions for hypothesis matrix -# wrong_size_hyp = (k, k+1) -# @test_throws MLJTransforms.MATRIX_SIZE_ERROR_HYP(k, wrong_size_hyp, :A) begin -# contrast_encoder_fit(data, [:A], mode=:hypothesis, buildmatrix=wrong_buildmatrix_hyp, ignore=false) -# end +@testset "Contrast Encoder Error Handling" begin + # Example definitions to allow the test to run + function dummy_buildmatrix(colname, k) + # Simple dummy function to generate a matrix of correct size + return randn(k, k-1) # Adjust dimensions as needed for the test + end + + # Define a DataFrame or appropriate data structure to test with + data = DataFrame( + A = categorical(["level1", "level2", "level3"]), + B = 
categorical(["levelX", "levelY", "levelZ"]), + ) + + # Test IGNORE_MUST_FALSE_VEC_MODE error + @test_throws MLJTransforms.IGNORE_MUST_FALSE_VEC_MODE begin + contrast_encoder_fit(data, [:A], mode=[:contrast], ignore=true) + end + + # Test LENGTH_MISMATCH_VEC_MODE error + @test_throws MLJTransforms.LENGTH_MISMATCH_VEC_MODE(2, 1) begin + contrast_encoder_fit(data, [:A], mode=[:contrast, :dummy], buildmatrix=dummy_buildmatrix, ignore=false) + end + + # Test BUILDFUNC_MUST_BE_SPECIFIED error + @test_throws MLJTransforms.BUILDFUNC_MUST_BE_SPECIFIED begin + contrast_encoder_fit(data, [:A], mode=:contrast, ignore=false) + end + + # Test MATRIX_SIZE_ERROR + wrong_buildmatrix = (levels, k) -> randn(k, k) # Incorrect dimensions + k = 3 # Number of levels in data[:A] + wrong_size = (k, k) + @test_throws MLJTransforms.MATRIX_SIZE_ERROR(k, wrong_size, :A) begin + contrast_encoder_fit(data, [:A], mode=:contrast, buildmatrix=wrong_buildmatrix, ignore=false) + end + + # Test MATRIX_SIZE_ERROR_HYP + wrong_buildmatrix_hyp = (levels, k) -> randn(k, k+1) # Incorrect dimensions for hypothesis matrix + wrong_size_hyp = (k, k+1) + @test_throws MLJTransforms.MATRIX_SIZE_ERROR_HYP(k, wrong_size_hyp, :A) begin + contrast_encoder_fit(data, [:A], mode=:hypothesis, buildmatrix=wrong_buildmatrix_hyp, ignore=false) + end -# end - -# @testset "Dummy Coding Tests" begin -# for k in 2:5 # Testing for various numbers of levels -# contrast_matrix = get_dummy_contrast(k) -# expected_matrix = Matrix(1.0I, k, k-1) -# @test contrast_matrix == expected_matrix -# @test size(contrast_matrix) == (k, k-1) -# end -# # test that fit is correct for dummy Coding -# cache = contrast_encoder_fit(X, [:name]; ignore=false, mode = :dummy) -# k = length(levels(X.name)) -# contrast_matrix = get_dummy_contrast(k) -# print() -# for (i, level) in enumerate(levels(X.name)) -# println(cache[:vector_given_value_given_feature]) -# @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :] -# end -# end - - -# @testset "Sum Coding Tests" begin -# # Manually define the expected matrix for a 4-level categorical variable -# expected_matrix_4 = [1.0 0.0 0.0; -# 0.0 1.0 0.0; -# 0.0 0.0 1.0; -# -1.0 -1.0 -1.0] # Sum of each column for the first three rows is zeroed by the last row -# contrast_matrix_4 = get_sum_contrast(4) -# @test contrast_matrix_4 == expected_matrix_4 -# @test size(contrast_matrix_4) == (4, 3) - -# # Additional tests can be included for different levels, with each matrix defined manually -# # Example for 3 levels -# expected_matrix_3 = [1.0 0.0; -# 0.0 1.0; -# -1.0 -1.0] -# contrast_matrix_3 = get_sum_contrast(3) -# @test contrast_matrix_3 == expected_matrix_3 -# @test size(contrast_matrix_3) == (3, 2) -# # test that fit is correct for sum Coding -# cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :sum) -# k = length(levels(X.favnum)) -# contrast_matrix = get_sum_contrast(k) -# for (i, level) in enumerate(levels(X.favnum)) -# @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :] -# end -# end - -# @testset "Backward Difference Coding Tests" begin -# # Manually define the expected matrix for a 4 level categorical variable -# expected_matrix_4 = [-0.75 -0.5 -0.25; -# 0.25 -0.5 -0.25; -# 0.25 0.5 -0.25; -# 0.25 0.5 0.75] -# contrast_matrix_4 = get_backward_diff_contrast(4) -# @test contrast_matrix_4 == expected_matrix_4 -# @test size(contrast_matrix_4) == (4, 3) - -# # Test that fit is correct for backward Coding -# cache = contrast_encoder_fit(X, [:name, 
:favnum]; ignore=false, mode = :backward_diff) -# k = length(levels(X.favnum)) -# contrast_matrix = get_backward_diff_contrast(k) -# for (i, level) in enumerate(levels(X.favnum)) -# @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :] -# end -# end - -# @testset "Forward Difference Coding Tests" begin -# for k in 2:5 -# backward_matrix = get_backward_diff_contrast(k) -# forward_matrix = get_forward_diff_contrast(k) -# @test forward_matrix == -backward_matrix -# @test size(forward_matrix) == (k, k-1) -# end - -# # Test that fit is correct for forward Coding -# cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :forward_diff) -# k = length(levels(X.favnum)) -# contrast_matrix = get_forward_diff_contrast(k) -# for (i, level) in enumerate(levels(X.favnum)) -# @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :] -# end -# end - -# @testset "helmert_vector function tests" begin -# @test create_helmert_vector(1, 5) == [-1.0, 1.0, 0.0, 0.0, 0.0] -# @test create_helmert_vector(2, 5) == [-1.0, -1.0, 2.0, 0.0, 0.0] -# @test create_helmert_vector(3, 5) == [-1.0, -1.0, -1.0, 3.0, 0.0] -# @test create_helmert_vector(4, 5) == [-1.0, -1.0, -1.0, -1.0, 4.0] -# @test create_helmert_vector(1, 3) == [-1.0, 1.0, 0.0] -# @test create_helmert_vector(2, 3) == [-1.0, -1.0, 2.0] -# k = 4 -# @test get_helmert_contrast(k) == [ -# -1.0 -1.0 -1.0 -# 1.0 -1.0 -1.0 -# 0.0 2.0 -1.0 -# 0.0 0.0 3.0] -# # test that fit is correct for helmert Coding -# cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :helmert) -# k = length(levels(X.name)) -# contrast_matrix = get_helmert_contrast(k) -# for (i, level) in enumerate(levels(X.name)) -# @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :] -# end -# end - - -# # @testset "contrast matrix end-to-end test" -# @testset "contrast mode end-to-end test" begin - - -# function buildrandomcontrast(colname, k) -# return rand(StableRNGs.StableRNG(123), k, k-1) -# end - -# cache = contrast_encoder_fit(X; mode=:contrast, buildmatrix=buildrandomcontrast) - -# X_tr = contrast_encoder_transform(X, cache) -# X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] - - -# df = DataFrame(X) - -# mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( -# :name => StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 3)), -# :favnum=> StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 4)) -# )) - -# X_tr_sm = ModelMatrix(mf).m[:, 2:end] - -# @test X_tr_mlj == X_tr_sm -# end - -# # @testset "hypothesis matrix end-to-end test" -# @testset "hypothesis mode end-to-end test" begin - -# function buildrandomhypothesis(colname, k) -# return rand(StableRNGs.StableRNG(123), k-1, k) -# end - -# cache = contrast_encoder_fit(X; mode=:hypothesis, buildmatrix=buildrandomhypothesis) -# X_tr = contrast_encoder_transform(X, cache) -# X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] -# df = DataFrame(X) - -# mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( -# :name => HypothesisCoding(buildrandomhypothesis(nothing, 3); levels=levels(X.name), labels=[]), -# :favnum=> HypothesisCoding(buildrandomhypothesis(nothing, 4); levels=levels(X.favnum), labels=[]) -# )) - -# X_tr_sm = ModelMatrix(mf).m[:, 2:end] - -# @test X_tr_mlj == X_tr_sm -# end - - -# function buildrandomhypothesis(colname, k) -# return rand(StableRNGs.StableRNG(123), k-1, k) -# end - -# function buildrandomcontrast(colname, k) -# return rand(StableRNGs.StableRNG(123), k, k-1) 
-# end
-
-# @testset "single-mode end-to-end test with StatsModels" begin
-# # test end-to-end single_column transformations
-# for ind in 1:6
-# stats_models(k, ind) = [
-# StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)),
-# DummyCoding(; base=(k == 3) ? "Mary" : 10),
-# EffectsCoding(; base=(k == 3) ? "Mary" : 10),
-# SeqDiffCoding(),
-# HelmertCoding(),
-# HypothesisCoding(buildrandomhypothesis(nothing, k); levels=(k == 3) ? levels(X.name) : levels(X.favnum), labels=[]),
-# ][ind]
-# modes = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis]
-# matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis]
-
-# # Try MLJTransforms
-# cache = contrast_encoder_fit(X; mode=modes[ind], buildmatrix=matrix_func[ind])
-# X_tr = contrast_encoder_transform(X, cache)
-
-# df = DataFrame(X)
-
-# mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict(
-# :name => stats_models(3, ind),
-# :favnum=> stats_models(4, ind),
-# ))
-
-# X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1]
-# X_tr_sm = ModelMatrix(mf).m[:, 2:end]
-# @test X_tr_mlj ≈ X_tr_sm
-# end
-# end
-
-# @testset "multi-mode end-to-end test with StatsModels" begin
-# # test end-to-end single_column transformations
-# for ind1 in 1:6
-# for ind2 in 2:5
-# stats_models(k, ind) = [
-# StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)),
-# DummyCoding(; base=(k == 3) ? "Mary" : 10),
-# EffectsCoding(; base=(k == 3) ? "Mary" : 10),
-# SeqDiffCoding(),
-# HelmertCoding(),
-# HypothesisCoding(buildrandomhypothesis(nothing, k); levels=(k == 3) ? levels(X.name) : levels(X.favnum), labels=[]),
-# ][ind]
+end
+
+@testset "Dummy Coding Tests" begin
+    for k in 2:5 # Testing for various numbers of levels
+        contrast_matrix = get_dummy_contrast(k)
+        expected_matrix = Matrix(1.0I, k, k-1)
+        @test contrast_matrix == expected_matrix
+        @test size(contrast_matrix) == (k, k-1)
+    end
+    # test that fit is correct for dummy coding
+    cache = contrast_encoder_fit(X, [:name]; ignore=false, mode = :dummy)
+    k = length(levels(X.name))
+    contrast_matrix = get_dummy_contrast(k)
+    for (i, level) in enumerate(levels(X.name))
+        @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :]
+    end
+end
+
+
+@testset "Sum Coding Tests" begin
+    # Manually define the expected matrix for a 4-level categorical variable
+    expected_matrix_4 = [1.0 0.0 0.0;
+                         0.0 1.0 0.0;
+                         0.0 0.0 1.0;
+                         -1.0 -1.0 -1.0] # the last row makes each column sum to zero
+    contrast_matrix_4 = get_sum_contrast(4)
+    @test contrast_matrix_4 == expected_matrix_4
+    @test size(contrast_matrix_4) == (4, 3)
+
+    # Additional tests can be included for different levels, with each matrix defined manually
+    # Example for 3 levels
+    expected_matrix_3 = [1.0 0.0;
+                         0.0 1.0;
+                         -1.0 -1.0]
+    contrast_matrix_3 = get_sum_contrast(3)
+    @test contrast_matrix_3 == expected_matrix_3
+    @test size(contrast_matrix_3) == (3, 2)
+    # test that fit is correct for sum coding
+    cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :sum)
+    k = length(levels(X.favnum))
+    contrast_matrix = get_sum_contrast(k)
+    for (i, level) in enumerate(levels(X.favnum))
+        @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :]
+    end
+end
+
+@testset "Backward Difference Coding Tests" begin
+    # Manually define the expected matrix for a 4-level categorical variable
+    expected_matrix_4 = [-0.75 -0.5 -0.25;
+                         0.25 -0.5 -0.25;
+                         0.25 0.5 -0.25;
+                         0.25 0.5 0.75]
+    contrast_matrix_4 = get_backward_diff_contrast(4)
+    @test contrast_matrix_4 == expected_matrix_4
+    @test size(contrast_matrix_4) == (4, 3)
+
+    # Test that fit is correct for backward coding
+    cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :backward_diff)
+    k = length(levels(X.favnum))
+    contrast_matrix = get_backward_diff_contrast(k)
+    for (i, level) in enumerate(levels(X.favnum))
+        @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :]
+    end
+end
+
+@testset "Forward Difference Coding Tests" begin
+    for k in 2:5
+        backward_matrix = get_backward_diff_contrast(k)
+        forward_matrix = get_forward_diff_contrast(k)
+        @test forward_matrix == -backward_matrix
+        @test size(forward_matrix) == (k, k-1)
+    end
+
+    # Test that fit is correct for forward coding
+    cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :forward_diff)
+    k = length(levels(X.favnum))
+    contrast_matrix = get_forward_diff_contrast(k)
+    for (i, level) in enumerate(levels(X.favnum))
+        @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :]
+    end
+end
+
+@testset "helmert_vector function tests" begin
+    @test create_helmert_vector(1, 5) == [-1.0, 1.0, 0.0, 0.0, 0.0]
+    @test create_helmert_vector(2, 5) == [-1.0, -1.0, 2.0, 0.0, 0.0]
+    @test create_helmert_vector(3, 5) == [-1.0, -1.0, -1.0, 3.0, 0.0]
+    @test create_helmert_vector(4, 5) == [-1.0, -1.0, -1.0, -1.0, 4.0]
+    @test create_helmert_vector(1, 3) == [-1.0, 1.0, 0.0]
+    @test create_helmert_vector(2, 3) == [-1.0, -1.0, 2.0]
+    k = 4
+    @test get_helmert_contrast(k) == [
+        -1.0 -1.0 -1.0
+        1.0 -1.0 -1.0
+        0.0 2.0 -1.0
+        0.0 0.0 3.0]
+    # test that fit is correct for helmert coding
+    cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :helmert)
+    k = length(levels(X.name))
+    contrast_matrix = get_helmert_contrast(k)
+    for (i, level) in enumerate(levels(X.name))
+        @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :]
+    end
+end
+
+
+# @testset "contrast matrix end-to-end test"
+@testset "contrast mode end-to-end test" begin
+
+
+    function buildrandomcontrast(colname, k)
+        return rand(StableRNGs.StableRNG(123), k, k-1)
+    end
+
+    cache = contrast_encoder_fit(X; mode=:contrast, buildmatrix=buildrandomcontrast)
+
+    X_tr = contrast_encoder_transform(X, cache)
+    X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1]
+
+
+    df = DataFrame(X)
+
+    mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict(
+        :name => StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 3)),
+        :favnum=> StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 4))
+    ))
+
+    X_tr_sm = ModelMatrix(mf).m[:, 2:end]
+
+    @test X_tr_mlj == X_tr_sm
+end
+
+# @testset "hypothesis matrix end-to-end test"
+@testset "hypothesis mode end-to-end test" begin
+
+    function buildrandomhypothesis(colname, k)
+        return rand(StableRNGs.StableRNG(123), k-1, k)
+    end
+
+    cache = contrast_encoder_fit(X; mode=:hypothesis, buildmatrix=buildrandomhypothesis)
+    X_tr = contrast_encoder_transform(X, cache)
+    X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1]
+    df = DataFrame(X)
+
+    mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict(
+        :name => HypothesisCoding(buildrandomhypothesis(nothing, 3); levels=levels(X.name), labels=[]),
+        :favnum=> HypothesisCoding(buildrandomhypothesis(nothing, 4); levels=levels(X.favnum), labels=[])
+    ))
+
+    X_tr_sm = ModelMatrix(mf).m[:, 2:end]
+
+    @test X_tr_mlj == X_tr_sm
+end
+
+
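+# The seeded builders below are shared by the single- and multi-mode end-to-end
+# testsets that follow; StableRNG(123) keeps the random contrast/hypothesis
+# matrices identical between the MLJTransforms and StatsModels code paths.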
+function buildrandomhypothesis(colname, k)
+    return rand(StableRNGs.StableRNG(123), k-1, k)
+end
+
+function buildrandomcontrast(colname, k)
+    return rand(StableRNGs.StableRNG(123), k, k-1)
+end
+
+@testset "single-mode end-to-end test with StatsModels" begin
+    # test end-to-end single-column transformations
+    for ind in 1:6
+        stats_models(k, ind) = [
+            StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)),
+            DummyCoding(; base=(k == 3) ? "Mary" : 10),
+            EffectsCoding(; base=(k == 3) ? "Mary" : 10),
+            SeqDiffCoding(),
+            HelmertCoding(),
+            HypothesisCoding(buildrandomhypothesis(nothing, k); levels=(k == 3) ? levels(X.name) : levels(X.favnum), labels=[]),
+        ][ind]
+        modes = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis]
+        matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis]
+
+        # Try MLJTransforms
+        cache = contrast_encoder_fit(X; mode=modes[ind], buildmatrix=matrix_func[ind])
+        X_tr = contrast_encoder_transform(X, cache)
+
+        df = DataFrame(X)
+
+        mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict(
+            :name => stats_models(3, ind),
+            :favnum=> stats_models(4, ind),
+        ))
+
+        X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1]
+        X_tr_sm = ModelMatrix(mf).m[:, 2:end]
+        @test X_tr_mlj ≈ X_tr_sm
+    end
+end
+
+@testset "multi-mode end-to-end test with StatsModels" begin
+    # test end-to-end multi-column transformations
+    for ind1 in 1:6
+        for ind2 in 2:5
+            stats_models(k, ind) = [
+                StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)),
+                DummyCoding(; base=(k == 3) ? "Mary" : 10),
+                EffectsCoding(; base=(k == 3) ? "Mary" : 10),
+                SeqDiffCoding(),
+                HelmertCoding(),
+                HypothesisCoding(buildrandomhypothesis(nothing, k); levels=(k == 3) ? levels(X.name) : levels(X.favnum), labels=[]),
+            ][ind]
-# modes = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis]
-# matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis]
+            modes = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis]
+            matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis]
-# # Try MLJTransforms
-# cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode=[modes[ind1], modes[ind2]], buildmatrix=matrix_func[ind1])
-# X_tr = contrast_encoder_transform(X, cache)
+            # Try MLJTransforms
+            cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode=[modes[ind1], modes[ind2]], buildmatrix=matrix_func[ind1])
+            X_tr = contrast_encoder_transform(X, cache)
-# df = DataFrame(X)
-
-# mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict(
-# :name => stats_models(3, ind1),
-# :favnum=> stats_models(4, ind2),
-# ))
-
-# X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1]
-# X_tr_sm = ModelMatrix(mf).m[:, 2:end]
-
-# @test X_tr_mlj ≈ X_tr_sm
-# end
-# end
-# end
+            df = DataFrame(X)
+
+            mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict(
+                :name => stats_models(3, ind1),
+                :favnum=> stats_models(4, ind2),
+            ))
+
+            X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1]
+            X_tr_sm = ModelMatrix(mf).m[:, 2:end]
+
+            @test X_tr_mlj ≈ X_tr_sm
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 8cf6385..b867435 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -25,11 +25,11 @@ _get(x) = CategoricalArrays.DataAPI.unwrap(x)
 include("utils.jl")
 include("generic.jl")
-# include("encoders/target_encoding.jl")
-# include("encoders/ordinal_encoding.jl")
-# include("encoders/frequency_encoder.jl")
-# include("transformers/cardinality_reducer.jl")
-# include("encoders/missingness_encoding.jl")
+include("encoders/target_encoding.jl")
+include("encoders/ordinal_encoding.jl")
+include("encoders/frequency_encoder.jl")
+include("transformers/cardinality_reducer.jl")
+include("encoders/missingness_encoding.jl")
 include("encoders/contrast_encoder.jl")
 
 # Other transformers

From 52ba39de812678086b5c6b54792a27122bba2b58 Mon Sep 17 00:00:00 2001
From: Essam
Date: Tue, 27 May 2025 06:47:53 +0300
Subject: [PATCH 7/7] =?UTF-8?q?=E2=9C=A8=20Add=20support=20for=20single=20?=
 =?UTF-8?q?vector?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Project.toml    |  2 +-
 src/generic.jl  |  8 ++++++--
 test/generic.jl | 16 ++++++++++++++++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/Project.toml b/Project.toml
index 101d4f4..73054a3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -26,7 +26,7 @@ ScientificTypes = "3.0"
 StatsBase = "0.34"
 TableOperations = "1.2"
 Tables = "1.11"
-julia = "1.6"
+julia = "1.10"
 
 [extras]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
diff --git a/src/generic.jl b/src/generic.jl
index 00051b5..ed41ac2 100644
--- a/src/generic.jl
+++ b/src/generic.jl
@@ -16,7 +16,7 @@ logic?"
   - X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
     `Multiclass` or `OrderedFactor`
   - features=[]: A list of names of categorical features given as symbols to exclude or include from encoding,
-    according to the value of `ignore`
+    according to the value of `ignore`, or a single symbol (treated as a one-element vector), or a callable that returns true for features to be included/excluded
   - ignore=true: Whether to exclude or include the features given in features
   - ordered_factor=false: Whether to encode OrderedFactor or ignore them
@@ -40,8 +40,12 @@ function generic_fit(X,
     feat_names = Tables.schema(X).names
 
     #2. Modify column_names based on features
+    if features isa Symbol
+        features = [features]
+    end
+
     if features isa AbstractVector{Symbol}
-        # Original behavior for vector of symbols
+        # Original behavior for vector of symbols
         feat_names = (ignore) ? setdiff(feat_names, features) : intersect(feat_names, features)
     else
diff --git a/test/generic.jl b/test/generic.jl
index ddcc9fd..3e3212e 100644
--- a/test/generic.jl
+++ b/test/generic.jl
@@ -182,4 +182,20 @@ end
     # Test 3: predicate with ordered_factor=true picks up ordered factors (e.g., :E)
     cache3 = dummy_encoder_fit(X, predicate; ignore=false, ordered_factor=true)
     @test Set(cache3[:encoded]) == Set([:A, :C, :E])
+end
+
+@testset "Single Symbol and list of one symbol equivalence" begin
+    X = dataset_forms[1]
+    feat_names = Tables.schema(X).names
+
+    # Test 1: single symbol
+    single_symbol = :A
+    cache1 = dummy_encoder_fit(X, single_symbol; ignore=true, ordered_factor=false)
+    @test !(:A in cache1[:encoded])
+    # Test 2: list of one symbol
+    single_symbol_list = [:A]
+    cache2 = dummy_encoder_fit(X, single_symbol_list; ignore=true, ordered_factor=false)
+    @test !(:A in cache2[:encoded])
+    # Test 3: both should yield the same result
+    @test cache1[:encoded] == cache2[:encoded]
 end
\ No newline at end of file