diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 64a4f48..7fca62c 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -24,7 +24,6 @@ jobs: matrix: version: - '1.10' - - '1.6' - 'nightly' os: - ubuntu-latest diff --git a/Project.toml b/Project.toml index 5324dab..73054a3 100644 --- a/Project.toml +++ b/Project.toml @@ -26,7 +26,7 @@ ScientificTypes = "3.0" StatsBase = "0.34" TableOperations = "1.2" Tables = "1.11" -julia = "1.6.7" +julia = "1.10" [extras] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" diff --git a/src/encoders/contrast_encoder/contrast_encoder.jl b/src/encoders/contrast_encoder/contrast_encoder.jl index 486e7e9..e6f8138 100644 --- a/src/encoders/contrast_encoder/contrast_encoder.jl +++ b/src/encoders/contrast_encoder/contrast_encoder.jl @@ -79,7 +79,7 @@ Fit a contrast encoing scheme on given data in `X`. """ function contrast_encoder_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; mode::Union{Symbol, AbstractVector{Symbol}} = :dummy, buildmatrix = nothing, ignore::Bool = true, diff --git a/src/encoders/contrast_encoder/interface_mlj.jl b/src/encoders/contrast_encoder/interface_mlj.jl index 6585361..18087b1 100644 --- a/src/encoders/contrast_encoder/interface_mlj.jl +++ b/src/encoders/contrast_encoder/interface_mlj.jl @@ -1,11 +1,11 @@ ### ContrastEncoding with MLJ Interface # 1. Interface Struct -mutable struct ContrastEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised - features::AS +mutable struct ContrastEncoder{ASS <: Union{Symbol, AbstractVector{Symbol}}, A1 <: Any, A2 <: Any} <: Unsupervised + features::A1 ignore::Bool - mode::Union{Symbol, AS} - buildmatrix::Any + mode:: ASS + buildmatrix::A2 ordered_factor::Bool end; diff --git a/src/encoders/frequency_encoding/frequency_encoding.jl b/src/encoders/frequency_encoding/frequency_encoding.jl index 928f45a..02bf325 100644 --- a/src/encoders/frequency_encoding/frequency_encoding.jl +++ b/src/encoders/frequency_encoding/frequency_encoding.jl @@ -20,7 +20,7 @@ categorical features with their (normalized or raw) frequencies of occurrence in """ function frequency_encoder_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, normalize::Bool = false, diff --git a/src/encoders/frequency_encoding/interface_mlj.jl b/src/encoders/frequency_encoding/interface_mlj.jl index 1e477b2..674f0c7 100644 --- a/src/encoders/frequency_encoding/interface_mlj.jl +++ b/src/encoders/frequency_encoding/interface_mlj.jl @@ -1,8 +1,8 @@ ### FrequencyEncoding with MLJ Interface # 1. Interface Struct -mutable struct FrequencyEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised - features::AS +mutable struct FrequencyEncoder{A <: Any} <: Unsupervised + features::A ignore::Bool ordered_factor::Bool normalize::Bool diff --git a/src/encoders/missingness_encoding/interface_mlj.jl b/src/encoders/missingness_encoding/interface_mlj.jl index d39228e..85c3802 100644 --- a/src/encoders/missingness_encoding/interface_mlj.jl +++ b/src/encoders/missingness_encoding/interface_mlj.jl @@ -2,11 +2,10 @@ # 1. Interface Struct mutable struct MissingnessEncoder{ - AS <: AbstractVector{Symbol}, T <: Type, A <: Any, } <: Unsupervised - features::AS + features::A ignore::Bool ordered_factor::Bool label_for_missing::Dict{T, A} diff --git a/src/encoders/missingness_encoding/missingness_encoding.jl b/src/encoders/missingness_encoding/missingness_encoding.jl index 0e4e39a..ee19240 100644 --- a/src/encoders/missingness_encoding/missingness_encoding.jl +++ b/src/encoders/missingness_encoding/missingness_encoding.jl @@ -27,7 +27,7 @@ types that are in `Char`, `AbstractString`, and `Number`. """ function missingness_encoder_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, label_for_missing::Dict{<:Type, <:Any} = Dict( diff --git a/src/encoders/ordinal_encoding/interface_mlj.jl b/src/encoders/ordinal_encoding/interface_mlj.jl index 86549d5..67b2b0f 100644 --- a/src/encoders/ordinal_encoding/interface_mlj.jl +++ b/src/encoders/ordinal_encoding/interface_mlj.jl @@ -1,8 +1,8 @@ ### OrdinalEncoding with MLJ Interface # 1. Interface Struct -mutable struct OrdinalEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised - features::AS +mutable struct OrdinalEncoder{A <: Any} <: Unsupervised + features::A ignore::Bool ordered_factor::Bool output_type::Type diff --git a/src/encoders/ordinal_encoding/ordinal_encoding.jl b/src/encoders/ordinal_encoding/ordinal_encoding.jl index c3c7d0a..5b966f6 100644 --- a/src/encoders/ordinal_encoding/ordinal_encoding.jl +++ b/src/encoders/ordinal_encoding/ordinal_encoding.jl @@ -18,7 +18,7 @@ Fit an encoder to encode the levels of categorical variables in a given table as """ function ordinal_encoder_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, output_type::Type = Float32, diff --git a/src/encoders/target_encoding/interface_mlj.jl b/src/encoders/target_encoding/interface_mlj.jl index 686c48d..365e788 100644 --- a/src/encoders/target_encoding/interface_mlj.jl +++ b/src/encoders/target_encoding/interface_mlj.jl @@ -1,9 +1,9 @@ ### TargetEncoding with MLJ Interface # 1. Interface Struct -mutable struct TargetEncoder{R1 <: Real, R2 <: Real, AS <: AbstractVector{Symbol}} <: +mutable struct TargetEncoder{R1 <: Real, R2 <: Real, A <: Any} <: Unsupervised - features::AS + features::A ignore::Bool ordered_factor::Bool lambda::R1 @@ -45,7 +45,7 @@ end struct TargetEncoderResult{ I <: Integer, S <: AbstractString, - A <: Any # Useless but likely can't do much better + A <: Any, # Useless but likely can't do much better } <: MMI.MLJType # target statistic for each level of each categorical feature y_stat_given_feat_level::Dict{A, A} diff --git a/src/encoders/target_encoding/target_encoding.jl b/src/encoders/target_encoding/target_encoding.jl index e70d628..e959ee9 100644 --- a/src/encoders/target_encoding/target_encoding.jl +++ b/src/encoders/target_encoding/target_encoding.jl @@ -132,7 +132,7 @@ Fit a target encoder on table X with target y by computing the necessary statist function target_encoder_fit( X, y::AbstractVector, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, lambda::Real = 1.0, diff --git a/src/generic.jl b/src/generic.jl index 8e909b0..ed41ac2 100644 --- a/src/generic.jl +++ b/src/generic.jl @@ -13,21 +13,23 @@ logic?" # Arguments - - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) + - X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor` - - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding - - `ignore=true`: Whether to exclude or includes the features given in `features` - - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them - - `feature_mapper`: Defined above. + - features=[]: A list of names of categorical features given as symbols to exclude or include from encoding, + according to the value of `ignore`, or a single symbol (which is treated as a vector with one symbol), + or a callable that returns true for features to be included/excluded + - ignore=true: Whether to exclude or includes the features given in features + - ordered_factor=false: Whether to encode OrderedFactor or ignore them + - feature_mapper: Defined above. # Returns - - `mapping_per_feat_level`: Maps each level for each feature in a subset of the categorical features of + - mapping_per_feat_level: Maps each level for each feature in a subset of the categorical features of X into a scalar or a vector. - - `encoded_features`: The subset of the categorical features of X that were encoded + - encoded_features: The subset of the categorical features of X that were encoded """ function generic_fit(X, - features::AbstractVector{Symbol} = Symbol[], + features = Symbol[], args...; ignore::Bool = true, ordered_factor::Bool = false, @@ -38,7 +40,22 @@ function generic_fit(X, feat_names = Tables.schema(X).names #2. Modify column_names based on features - feat_names = (ignore) ? setdiff(feat_names, features) : intersect(feat_names, features) + if features isa Symbol + features = [features] + end + + if features isa AbstractVector{Symbol} + # Original behavior for vector of symbols + feat_names = + (ignore) ? setdiff(feat_names, features) : intersect(feat_names, features) + else + # If features is a callable, apply it to each feature name + if ignore + feat_names = filter(name -> !features(name), feat_names) + else + feat_names = filter(features, feat_names) + end + end # 3. Define mapping per column per level dictionary mapping_per_feat_level = Dict() diff --git a/src/transformers/cardinality_reducer/cardinality_reducer.jl b/src/transformers/cardinality_reducer/cardinality_reducer.jl index 0ad9d5c..fbc374a 100644 --- a/src/transformers/cardinality_reducer/cardinality_reducer.jl +++ b/src/transformers/cardinality_reducer/cardinality_reducer.jl @@ -32,7 +32,7 @@ types that are in `Char`, `AbstractString`, and `Number`. """ function cardinality_reducer_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, min_frequency::Real = 3, diff --git a/src/transformers/cardinality_reducer/interface_mlj.jl b/src/transformers/cardinality_reducer/interface_mlj.jl index 31af464..3ad7a65 100644 --- a/src/transformers/cardinality_reducer/interface_mlj.jl +++ b/src/transformers/cardinality_reducer/interface_mlj.jl @@ -2,12 +2,11 @@ # 1. Interface Struct mutable struct CardinalityReducer{ - AS <: AbstractVector{Symbol}, R <: Real, T <: Type, A <: Any, } <: Unsupervised - features::AS + features::A ignore::Bool ordered_factor::Bool min_frequency::R diff --git a/test/encoders/contrast_encoder.jl b/test/encoders/contrast_encoder.jl index d58f76c..a36b527 100644 --- a/test/encoders/contrast_encoder.jl +++ b/test/encoders/contrast_encoder.jl @@ -9,7 +9,6 @@ age = [23, 23, 14, 23]) @testset "Contrast Encoder Error Handling" begin - # Example definitions to allow the test to run function dummy_buildmatrix(colname, k) # Simple dummy function to generate a matrix of correct size @@ -23,21 +22,35 @@ age = [23, 23, 14, 23]) ) # Test IGNORE_MUST_FALSE_VEC_MODE error - @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=[:contrast], ignore=true) + @test_throws MLJTransforms.IGNORE_MUST_FALSE_VEC_MODE begin + contrast_encoder_fit(data, [:A], mode=[:contrast], ignore=true) + end # Test LENGTH_MISMATCH_VEC_MODE error - @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=[:contrast, :dummy], buildmatrix=dummy_buildmatrix, ignore=false) + @test_throws MLJTransforms.LENGTH_MISMATCH_VEC_MODE(2, 1) begin + contrast_encoder_fit(data, [:A], mode=[:contrast, :dummy], buildmatrix=dummy_buildmatrix, ignore=false) + end # Test BUILDFUNC_MUST_BE_SPECIFIED error - @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=:contrast, ignore=false) + @test_throws MLJTransforms.BUILDFUNC_MUST_BE_SPECIFIED begin + contrast_encoder_fit(data, [:A], mode=:contrast, ignore=false) + end # Test MATRIX_SIZE_ERROR wrong_buildmatrix = (levels, k) -> randn(k, k) # Incorrect dimensions - @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=:contrast, buildmatrix=wrong_buildmatrix, ignore=false) + k = 3 # Number of levels in data[:A] + wrong_size = (k, k) + @test_throws MLJTransforms.MATRIX_SIZE_ERROR(k, wrong_size, :A) begin + contrast_encoder_fit(data, [:A], mode=:contrast, buildmatrix=wrong_buildmatrix, ignore=false) + end # Test MATRIX_SIZE_ERROR_HYP wrong_buildmatrix_hyp = (levels, k) -> randn(k, k+1) # Incorrect dimensions for hypothesis matrix - @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=:hypothesis, buildmatrix=wrong_buildmatrix_hyp, ignore=false) + wrong_size_hyp = (k, k+1) + @test_throws MLJTransforms.MATRIX_SIZE_ERROR_HYP(k, wrong_size_hyp, :A) begin + contrast_encoder_fit(data, [:A], mode=:hypothesis, buildmatrix=wrong_buildmatrix_hyp, ignore=false) + end + end @testset "Dummy Coding Tests" begin diff --git a/test/encoders/missingness_encoding.jl b/test/encoders/missingness_encoding.jl index d01d90f..6131163 100644 --- a/test/encoders/missingness_encoding.jl +++ b/test/encoders/missingness_encoding.jl @@ -1,21 +1,26 @@ using MLJTransforms: missingness_encoder_fit, missingness_encoder_transform -@testset "Throws errors when needed" begin - @test_throws ArgumentError begin +@testset "Missingness Encoder Error Handling" begin + # Test COLLISION_NEW_VAL_ME error - when label_for_missing value already exists in levels + @test_throws MLJTransforms.COLLISION_NEW_VAL_ME("missing") begin X = generate_X_with_missingness(;john_name="missing") cache = missingness_encoder_fit( X; label_for_missing = Dict(AbstractString => "missing", Char => 'm'), ) end - @test_throws ArgumentError begin + + # Test VALID_TYPES_NEW_VAL_ME error - when label_for_missing key is not a supported type + @test_throws MLJTransforms.VALID_TYPES_NEW_VAL_ME(Bool) begin X = generate_X_with_missingness() cache = missingness_encoder_fit( X; label_for_missing = Dict(AbstractString => "Other", Bool => 'X'), ) end - @test_throws ArgumentError begin + + # Test UNSPECIFIED_COL_TYPE_ME error - when column type isn't in label_for_missing + @test_throws MLJTransforms.UNSPECIFIED_COL_TYPE_ME(Char, Dict(AbstractString => "X")) begin X = generate_X_with_missingness() cache = missingness_encoder_fit( X; diff --git a/test/encoders/target_encoding.jl b/test/encoders/target_encoding.jl index 4e6f0d0..d9b42d7 100644 --- a/test/encoders/target_encoding.jl +++ b/test/encoders/target_encoding.jl @@ -332,13 +332,15 @@ end @test fitresult.task == generic_cache[:task] # Test invalid `m` - @test_throws ArgumentError begin - t = TargetEncoder(ignore = true, ordered_factor = false, lambda = 0.5, m = -5) + invalid_m = -5 + @test_throws MLJTransforms.NON_NEGATIVE_m(invalid_m) begin + t = TargetEncoder(ignore = true, ordered_factor = false, lambda = 0.5, m = invalid_m) end - - # Test invalid `lambda` - @test_throws ArgumentError begin - t = TargetEncoder(ignore = true, ordered_factor = false, lambda = 1.1, m = 1) + + # Test invalid `lambda` (value > 1) + invalid_lambda = 1.1 + @test_throws MLJTransforms.INVALID_lambda(invalid_lambda) begin + t = TargetEncoder(ignore = true, ordered_factor = false, lambda = invalid_lambda, m = 1) end # Test report diff --git a/test/generic.jl b/test/generic.jl index 4d9a805..3e3212e 100644 --- a/test/generic.jl +++ b/test/generic.jl @@ -62,7 +62,7 @@ end # Dummy encoder that maps each level to its hash (some arbitrary function) function dummy_encoder_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, ) @@ -81,6 +81,7 @@ function dummy_encoder_fit( ) cache = Dict( :hash_given_feat_val => hash_given_feat_val, + :encoded => encoded_features, ) return cache end @@ -161,4 +162,40 @@ end F = [enc(:F, X[:F][i]) for i in 1:10] ) @test X_tr == target +end + +@testset "Callable feature functionality tests" begin + X = dataset_forms[1] + feat_names = Tables.schema(X).names + + # Define a predicate: include only columns with name in uppercase list [:A, :C, :E] + predicate = name -> name in [:A, :C, :E] + + # Test 1: ignore=true should exclude predicate columns + cache1 = dummy_encoder_fit(X, predicate; ignore=true, ordered_factor=false) + @test !(:A in cache1[:encoded]) && !(:C in cache1[:encoded]) && !(:E in cache1[:encoded]) + + # Test 2: ignore=false should include only predicate columns + cache2 = dummy_encoder_fit(X, predicate; ignore=false, ordered_factor=false) + @test Set(cache2[:encoded]) == Set([:A, :C]) + + # Test 3: predicate with ordered_factor=true picks up ordered factors (e.g., :E) + cache3 = dummy_encoder_fit(X, predicate; ignore=false, ordered_factor=true) + @test Set(cache3[:encoded]) == Set([:A, :C, :E]) +end + +@testset "Single Symbol and list of one symbol equivalence" begin + X = dataset_forms[1] + feat_names = Tables.schema(X).names + + # Test 1: Single Symbol + single_symbol = :A + cache1 = dummy_encoder_fit(X, single_symbol; ignore=true, ordered_factor=false) + @test !(:A in cache1[:encoded]) + # Test 2: List of one symbol + single_symbol_list = [:A] + cache2 = dummy_encoder_fit(X, single_symbol_list; ignore=true, ordered_factor=false) + @test !(:A in cache2[:encoded]) + # Test 3: Both should yield the same result + @test cache1[:encoded] == cache2[:encoded] end \ No newline at end of file diff --git a/test/transformers/cardinality_reducer.jl b/test/transformers/cardinality_reducer.jl index e0d0dc3..c33e29a 100644 --- a/test/transformers/cardinality_reducer.jl +++ b/test/transformers/cardinality_reducer.jl @@ -1,30 +1,36 @@ using MLJTransforms: cardinality_reducer_fit, cardinality_reducer_transform - -@testset "Throws errors when needed" begin - @test_throws ArgumentError begin +@testset "Cardinality Reducer Error Handling" begin + # Test COLLISION_NEW_VAL error - when label_for_infrequent value already exists in data + @test_throws MLJTransforms.COLLISION_NEW_VAL('X') begin X = generate_high_cardinality_table(1000; obj = false, special_cat = 'X') cache = cardinality_reducer_fit( X; label_for_infrequent = Dict(AbstractString => "Other", Char => 'X'), ) end - @test_throws ArgumentError begin + + # Test VALID_TYPES_NEW_VAL error - when label_for_infrequent key is not a supported type + @test_throws MLJTransforms.VALID_TYPES_NEW_VAL(Bool) begin X = generate_high_cardinality_table(1000; obj = false, special_cat = 'O') cache = cardinality_reducer_fit( X; label_for_infrequent = Dict(AbstractString => "Other", Bool => 'X'), ) end - @test_throws ArgumentError begin + + # Test UNSPECIFIED_COL_TYPE error - when column type isn't in label_for_infrequent + @test_throws MLJTransforms.UNSPECIFIED_COL_TYPE(Char, Dict(AbstractString => "X")) begin X = generate_high_cardinality_table(1000) cache = cardinality_reducer_fit( X; min_frequency = 30, label_for_infrequent = Dict(AbstractString => "X"), + # Missing Char type in label_for_infrequent, which should be present in X ) end + end