diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 64a4f48..8793288 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -24,8 +24,7 @@ jobs: matrix: version: - '1.10' - - '1.6' - - 'nightly' + - '1' os: - ubuntu-latest arch: diff --git a/.gitignore b/.gitignore index 9df13d3..8d91ae8 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ meh/*.ipynb .DS_Store /*.jl scratchpad/ +examples/test.jl diff --git a/Project.toml b/Project.toml index 5324dab..27579db 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJTransforms" uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6" authors = ["Essam and contributors"] -version = "1.0.0-DEV" +version = "0.1.0" [deps] BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf" @@ -20,13 +20,21 @@ TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] +BitBasis = "0.9" CategoricalArrays = "0.10" MLJModelInterface = "1.11" +Combinatorics = "1" +Dates = "1" +Distributions = "0.25" +LinearAlgebra = "1" +OrderedCollections = "1" +Parameters = "0.12" ScientificTypes = "3.0" +Statistics = "1" StatsBase = "0.34" TableOperations = "1.2" Tables = "1.11" -julia = "1.6.7" +julia = "1.10" [extras] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" diff --git a/src/MLJTransforms.jl b/src/MLJTransforms.jl index f490241..a957ca6 100644 --- a/src/MLJTransforms.jl +++ b/src/MLJTransforms.jl @@ -7,7 +7,7 @@ using MLJModelInterface using TableOperations using StatsBase using LinearAlgebra - +using OrderedCollections: OrderedDict # Other transformers using Combinatorics import Distributions @@ -19,6 +19,7 @@ using OrderedCollections const MMI = MLJModelInterface # Functions of generic use across transformers +include("common_docs.jl") include("generic.jl") include("utils.jl") diff --git a/src/common_docs.jl b/src/common_docs.jl new file mode 100644 index 0000000..46bff23 --- /dev/null +++ b/src/common_docs.jl @@ -0,0 +1,27 @@ +const X_doc = """ +- X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) + `Multiclass` or `OrderedFactor` +""" +const X_doc_mlj = """ +- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must + have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to + check scitypes. +""" +const features_doc = """ +- features=[]: A list of names of categorical features given as symbols to exclude or include from encoding, + according to the value of `ignore`, or a single symbol (which is treated as a vector with one symbol), + or a callable that returns true for features to be included/excluded +""" +const ignore_doc = """ +- ignore=true: Whether to exclude or include the features given in `features` +""" +const ordered_factor_doc = """ +- ordered_factor=false: Whether to encode `OrderedFactor` or ignore them +""" +const encoded_features_doc = """ +- encoded_features: The subset of the categorical features of `X` that were encoded +""" +const cache_doc = """ +- `cache`: The output of `contrast_encoder_fit` +""" + diff --git a/src/encoders/contrast_encoder/contrast_encoder.jl b/src/encoders/contrast_encoder/contrast_encoder.jl index b6cdcb0..63db2e5 100644 --- a/src/encoders/contrast_encoder/contrast_encoder.jl +++ b/src/encoders/contrast_encoder/contrast_encoder.jl @@ -9,13 +9,13 @@ Where `k` is the number of levels in the feature and the returned contrast matri """ ### 1. 
Dummy Coding function get_dummy_contrast(k) - return Matrix(1.0I, k, k-1) + return Matrix(1.0I, k, k - 1) end ### 2. Sum Coding function get_sum_contrast(k) - C = Matrix(1.0I, k, k-1) + C = Matrix(1.0I, k, k - 1) C[end, :] .= -1.0 return C end @@ -26,7 +26,7 @@ function create_backward_vector(index::Int, length::Int) vec = ones(length) .* index / length # [ -(k-i)/k -(k-i)/k -(k-i)/k .. i/k i/k] - vec[1:index] .= index/length - 1 + vec[1:index] .= index / length - 1 return vec end function get_backward_diff_contrast(k) @@ -61,25 +61,25 @@ Fit a contrast encoding scheme on given data in `X`. # Arguments - - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor` - - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding + $X_doc + $features_doc - `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`. - If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different - contrast encoding scheme for each feature - - `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`, - where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or - hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`. - - `ignore=true`: Whether to exclude or includes the features given in `features` - - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them + If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different + contrast encoding scheme for each feature + - `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`, + where `colname` is the name of the feature and `k` is its number of levels, and which returns a contrast or + hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`. + $ignore_doc + $ordered_factor_doc -# Returns (in a dict) +# Returns (as a named tuple) - `vec_given_feat_level`: Maps each level for each column in the selected categorical features to a vector - - `encoded_features`: The subset of the categorical features of X that were encoded + $encoded_features_doc """ function contrast_encoder_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; mode::Union{Symbol, AbstractVector{Symbol}} = :dummy, buildmatrix = nothing, ignore::Bool = true, @@ -90,9 +90,10 @@ function contrast_encoder_fit( if mode isa Vector{Symbol} mode_is_vector = true ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE)) - length(features) == length(mode) || throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features)))) + length(features) == length(mode) || + throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features)))) end - + # buildmatrix should be specified if mode is :contrast or :hypothesis if mode in (:contrast, :hypothesis) buildmatrix === nothing && throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED)) end @@ -105,11 +106,13 @@ k = length(feat_levels) feat_mode = (mode_is_vector) ?
mode[findfirst(isequal(name), features)] : mode if feat_mode == :contrast - contrastmatrix = buildmatrix(name, k) - size(contrastmatrix) == (k, k-1) || throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name))) + contrastmatrix = buildmatrix(name, k) + size(contrastmatrix) == (k, k - 1) || + throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name))) elseif feat_mode == :hypothesis - hypothesismatrix = buildmatrix(name, k) - size(hypothesismatrix) == (k-1, k) || throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name))) + hypothesismatrix = buildmatrix(name, k) + size(hypothesismatrix) == (k - 1, k) || + throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name))) contrastmatrix = pinv(hypothesismatrix) elseif feat_mode == :dummy contrastmatrix = get_dummy_contrast(k) @@ -125,7 +128,9 @@ function contrast_encoder_fit( throw(ArgumentError("Mode $feat_mode is not supported.")) end - vector_given_value_given_feature = Dict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels)) + vector_given_value_given_feature = OrderedDict( + level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels) + ) return vector_given_value_given_feature end @@ -134,10 +139,9 @@ function contrast_encoder_fit( X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper, ) - - cache = Dict( - :vector_given_value_given_feature => vector_given_value_given_feature, - :encoded_features => encoded_features, + cache = ( + vector_given_value_given_feature = vector_given_value_given_feature, + encoded_features = encoded_features, ) return cache @@ -157,7 +161,12 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia - `X_tr`: The table with selected features after the selected features are encoded by contrast encoding. """ -function contrast_encoder_transform(X, cache::Dict) - vector_given_value_given_feature = cache[:vector_given_value_given_feature] - return generic_transform(X, vector_given_value_given_feature, single_feat = false) -end \ No newline at end of file +function contrast_encoder_transform(X, cache::NamedTuple) + vector_given_value_given_feature = cache.vector_given_value_given_feature + return generic_transform( + X, + vector_given_value_given_feature, + single_feat = false; + use_levelnames = true, + ) +end diff --git a/src/encoders/contrast_encoder/interface_mlj.jl b/src/encoders/contrast_encoder/interface_mlj.jl index 9c098fe..a42cd00 100644 --- a/src/encoders/contrast_encoder/interface_mlj.jl +++ b/src/encoders/contrast_encoder/interface_mlj.jl @@ -1,11 +1,11 @@ ### ContrastEncoding with MLJ Interface # 1. 
Interface Struct -mutable struct ContrastEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised - features::AS +mutable struct ContrastEncoder{ASS <: Union{Symbol, AbstractVector{Symbol}}, A1 <: Any, A2 <: Any} <: Unsupervised + features::A1 ignore::Bool - mode::Union{Symbol, AS} - buildmatrix::Any + mode::ASS + buildmatrix::A2 ordered_factor::Bool end; @@ -36,9 +36,9 @@ function MMI.fit(transformer::ContrastEncoder, verbosity::Int, X) buildmatrix = transformer.buildmatrix, ordered_factor = transformer.ordered_factor, ) - fitresult = generic_cache[:vector_given_value_given_feature] + fitresult = generic_cache.vector_given_value_given_feature - report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features + report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features cache = nothing return fitresult, cache, report end; @@ -46,9 +46,8 @@ end; # 6. Transform method function MMI.transform(transformer::ContrastEncoder, fitresult, Xnew) - generic_cache = Dict( - :vector_given_value_given_feature => - fitresult, + generic_cache = ( + vector_given_value_given_feature = fitresult, ) Xnew_transf = contrast_encoder_transform(Xnew, generic_cache) return Xnew_transf @@ -87,23 +86,21 @@ In MLJ (or MLJBase) bind an instance `model` to data with Here: -- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must - have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to - check scitypes. +$X_doc_mlj Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters -- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding +$features_doc - `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`. If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different contrast encoding scheme for each feature - `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`, where `colname` is the name of the feature and `k` is its number of levels, and which returns a contrast or hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
-- `ignore=true`: Whether to exclude or includes the features given in `features` -- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them +$ignore_doc +$ordered_factor_doc # Operations @@ -121,7 +118,7 @@ The fields of `fitted_params(mach)` are: The fields of `report(mach)` are: -- `encoded_features`: The subset of the categorical features of X that were encoded +$encoded_features_doc # Examples @@ -148,12 +145,12 @@ mach = fit!(machine(encoder, X)) Xnew = transform(mach, X) julia> Xnew - (name_1 = [1.0, 0.0, 0.0, 0.0], - name_2 = [0.0, 1.0, 0.0, 1.0], + (name_John = [1.0, 0.0, 0.0, 0.0], + name_Mary = [0.0, 1.0, 0.0, 1.0], height = [1.85, 1.67, 1.5, 1.67], - favnum_1 = [0.0, 1.0, 0.0, -1.0], - favnum_2 = [2.0, -1.0, 0.0, -1.0], - favnum_3 = [-1.0, -1.0, 3.0, -1.0], + favnum_5 = [0.0, 1.0, 0.0, -1.0], + favnum_7 = [2.0, -1.0, 0.0, -1.0], + favnum_10 = [-1.0, -1.0, 3.0, -1.0], age = [23, 23, 14, 23],) ``` diff --git a/src/encoders/frequency_encoding/frequency_encoding.jl b/src/encoders/frequency_encoding/frequency_encoding.jl index 39eee4f..936542c 100644 --- a/src/encoders/frequency_encoding/frequency_encoding.jl +++ b/src/encoders/frequency_encoding/frequency_encoding.jl @@ -7,39 +7,43 @@ categorical features with their (normalized or raw) frequencies of occurrence in # Arguments - - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor` - - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding - - `ignore=true`: Whether to exclude or includes the features given in `features` - - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them + $X_doc + $features_doc + $ignore_doc + $ordered_factor_doc - `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts. -# Returns (in a dict) +# Returns (as a named tuple) - `statistic_given_feat_val`: The frequency of each level of each selected categorical feature - - `encoded_features`: The subset of the categorical features of X that were encoded + $encoded_features_doc """ function frequency_encoder_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, normalize::Bool = false, + output_type::Type = Float32, ) # 1. Define feature mapper function feature_mapper(col, name) frequency_map = (!normalize) ? countmap(col) : proportionmap(col) - statistic_given_feat_val = Dict{Any, Real}(level=>frequency_map[level] for level in levels(col)) + feat_levels = levels(col) + statistic_given_feat_val = Dict{eltype(feat_levels), output_type}( + level => frequency_map[level] for level in feat_levels + ) return statistic_given_feat_val end # 2. Pass it to generic_fit statistic_given_feat_val, encoded_features = generic_fit( X, features; ignore = ignore, ordered_factor = ordered_factor, - feature_mapper = feature_mapper, - ) - cache = Dict( - :statistic_given_feat_val => statistic_given_feat_val, - :encoded_features => encoded_features, + feature_mapper = feature_mapper) + + cache = ( + statistic_given_feat_val = statistic_given_feat_val, + encoded_features = encoded_features, ) return cache end @@ -58,7 +62,7 @@ Encode the levels of a categorical variable in a given table with their (normali - `X_tr`: The table with selected features after the selected features are encoded by frequency encoding.
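For orientation, a minimal end-to-end sketch of this functional API (illustrative only: the table is invented, and the fit/transform pair is imported explicitly, as in the package tests):

```julia
using MLJTransforms: frequency_encoder_fit, frequency_encoder_transform
using CategoricalArrays

X = (grade = categorical(["A", "B", "A", "C"]), score = [1.0, 2.0, 3.0, 4.0])
# normalize = true stores proportions rather than raw counts
cache = frequency_encoder_fit(X; normalize = true)
X_tr = frequency_encoder_transform(X, cache)  # :grade mapped to frequencies; :score left as is
```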
""" -function frequency_encoder_transform(X, cache::Dict) - statistic_given_feat_val = cache[:statistic_given_feat_val] +function frequency_encoder_transform(X, cache::NamedTuple) + statistic_given_feat_val = cache.statistic_given_feat_val return generic_transform(X, statistic_given_feat_val) end diff --git a/src/encoders/frequency_encoding/interface_mlj.jl b/src/encoders/frequency_encoding/interface_mlj.jl index 89bd88b..83c002f 100644 --- a/src/encoders/frequency_encoding/interface_mlj.jl +++ b/src/encoders/frequency_encoding/interface_mlj.jl @@ -1,11 +1,12 @@ ### FrequencyEncoding with MLJ Interface # 1. Interface Struct -mutable struct FrequencyEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised - features::AS +mutable struct FrequencyEncoder{A <: Any} <: Unsupervised + features::A ignore::Bool ordered_factor::Bool normalize::Bool + output_type::Type end; # 2. Constructor @@ -14,8 +15,9 @@ function FrequencyEncoder(; ignore = true, ordered_factor = false, normalize = false, + output_type = Float32, ) - return FrequencyEncoder(features, ignore, ordered_factor, normalize) + return FrequencyEncoder(features, ignore, ordered_factor, normalize, output_type) end; @@ -32,10 +34,11 @@ function MMI.fit(transformer::FrequencyEncoder, verbosity::Int, X) ignore = transformer.ignore, ordered_factor = transformer.ordered_factor, normalize = transformer.normalize, + output_type = transformer.output_type, ) - fitresult = generic_cache[:statistic_given_feat_val] + fitresult = generic_cache.statistic_given_feat_val - report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features + report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features cache = nothing return fitresult, cache, report end; @@ -43,9 +46,8 @@ end; # 6. Transform method function MMI.transform(transformer::FrequencyEncoder, fitresult, Xnew) - generic_cache = Dict( - :statistic_given_feat_val => - fitresult, + generic_cache = ( + statistic_given_feat_val = fitresult, ) Xnew_transf = frequency_encoder_transform(Xnew, generic_cache) return Xnew_transf @@ -84,18 +86,17 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with Here: -- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must - have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to - check scitypes. +$X_doc_mlj Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters -- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding -- `ignore=true`: Whether to exclude or include the features given in `features` -- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them -- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts. +$features_doc +$ignore_doc +$ordered_factor_doc +- ` normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts. +- `output_type=Float32`: The type of the output values. The default is `Float32`, but you can set it to `Float64` or any other type that can hold the frequency values. 
# Operations @@ -113,7 +114,7 @@ The fields of `fitted_params(mach)` are: The fields of `report(mach)` are: -- `encoded_features`: The subset of the categorical features of X that were encoded +$encoded_features_doc # Examples diff --git a/src/encoders/missingness_encoding/interface_mlj.jl b/src/encoders/missingness_encoding/interface_mlj.jl index d39228e..7c08f41 100644 --- a/src/encoders/missingness_encoding/interface_mlj.jl +++ b/src/encoders/missingness_encoding/interface_mlj.jl @@ -2,11 +2,10 @@ # 1. Interface Struct mutable struct MissingnessEncoder{ - AS <: AbstractVector{Symbol}, T <: Type, A <: Any, } <: Unsupervised - features::AS + features::A ignore::Bool ordered_factor::Bool label_for_missing::Dict{T, A} @@ -40,9 +39,9 @@ function MMI.fit(transformer::MissingnessEncoder, verbosity::Int, X) ordered_factor = transformer.ordered_factor, label_for_missing = transformer.label_for_missing, ) - fitresult = generic_cache[:label_for_missing_given_feature] + fitresult = generic_cache.label_for_missing_given_feature - report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features + report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features cache = nothing return fitresult, cache, report end; @@ -50,9 +49,8 @@ end; # 6. Transform method function MMI.transform(transformer::MissingnessEncoder, fitresult, Xnew) - generic_cache = Dict( - :label_for_missing_given_feature => - fitresult, + generic_cache = ( + label_for_missing_given_feature = fitresult, ) Xnew_transf = missingness_encoder_transform(Xnew, generic_cache) return Xnew_transf @@ -92,17 +90,15 @@ In MLJ (or MLJBase) bind an instance `model` to data with Here: -- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must - have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to - check scitypes. +$X_doc_mlj Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters -- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding -- `ignore=true`: Whether to exclude or includes the features given in `features` -- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them +$features_doc +$ignore_doc +$ordered_factor_doc - `label_for_missing::Dict{<:Type, <:Any} = Dict( AbstractString => "missing", Char => 'm', )`: A dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value signifies the new level to map into given a column's raw super type. By default, if the raw type of the column subtypes `AbstractString` @@ -125,7 +121,7 @@ The fields of `fitted_params(mach)` are: The fields of `report(mach)` are: -- `encoded_features`: The subset of the categorical features of X that were encoded +$encoded_features_doc # Examples diff --git a/src/encoders/missingness_encoding/missingness_encoding.jl b/src/encoders/missingness_encoding/missingness_encoding.jl index 848916c..b01bd63 100644 --- a/src/encoders/missingness_encoding/missingness_encoding.jl +++ b/src/encoders/missingness_encoding/missingness_encoding.jl @@ -9,38 +9,38 @@ types that are in `Char`, `AbstractString`, and `Number`.
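To make the contract concrete before the argument list, here is a hedged sketch of the functional API defined below (the column name and replacement label are invented):

```julia
using MLJTransforms: missingness_encoder_fit, missingness_encoder_transform
using CategoricalArrays

X = (city = categorical(["Paris", missing, "Lyon", missing]),)
cache = missingness_encoder_fit(X; label_for_missing = Dict(AbstractString => "Unknown"))
X_tr = missingness_encoder_transform(X, cache)  # missing entries become the level "Unknown"
```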
# Arguments - - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) - `Multiclass` or `OrderedFactor` - - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding - - `ignore=true`: Whether to exclude or includes the features given in `features` - - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them + $X_doc + $features_doc + $ignore_doc + $ordered_factor_doc - `label_for_missing::Dict{<:Type, <:Any} = Dict( AbstractString => "missing", Char => 'm', )`: A dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value signifies the new level to map into given a column's raw super type. By default, if the raw type of the column subtypes `AbstractString` then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'` and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1. -# Returns (in a dict) +# Returns (as a named tuple) - `label_for_missing_given_feature`: A dictionary that, for each column, maps `missing` into some value according to `label_for_missing` - - `encoded_features`: The subset of the categorical features of X that were encoded + $encoded_features_doc """ function missingness_encoder_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, - label_for_missing::Dict{<:Type, <:Any} = Dict( + label_for_missing::Dict{<:Type, <:Any} = Dict( AbstractString => "missing", Char => 'm', ), ) - supportedtypes = Union{Char, AbstractString, Number} + supportedtypes_list = [Char, AbstractString, Number] + supportedtypes = Union{supportedtypes_list...} # 1. Define feature mapper function feature_mapper(col, name) - col_type = nonmissingtype(eltype(col)).parameters[1] - feat_levels = levels(col; skipmissing=true) + feat_levels = levels(col; skipmissing = true) + col_type = nonmissingtype(eltype(feat_levels)) # Ensure column type is valid (can't test because never occurs) # Converting array elements to strings before wrapping in a `CategoricalArray`, as... # Ensure label_for_missing keys are valid types for possible_col_type in keys(label_for_missing) - if !(possible_col_type in union_types(supportedtypes)) + if !(possible_col_type in supportedtypes_list) throw(ArgumentError(VALID_TYPES_NEW_VAL_ME(possible_col_type))) end end # Check no collision between keys(label_for_missing) and feat_levels for value in values(label_for_missing) - if !ismissing(value) + if !ismissing(value) if value in feat_levels throw(ArgumentError(COLLISION_NEW_VAL_ME(value))) end @@ -66,13 +66,13 @@ function missingness_encoder_fit( # Get ancestor type of column elgrandtype = nothing - for allowed_type in union_types(supportedtypes) + for allowed_type in supportedtypes_list if col_type <: allowed_type elgrandtype = allowed_type break end end - + # Nonmissing levels remain as is label_for_missing_given_feature = Dict{Missing, col_type}() @@ -90,11 +90,12 @@ # 2.
Pass it to generic_fit label_for_missing_given_feature, encoded_features = generic_fit( - X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper, + X, features; ignore = ignore, ordered_factor = ordered_factor, + feature_mapper = feature_mapper, ) - cache = Dict( - :label_for_missing_given_feature => label_for_missing_given_feature, - :encoded_features => encoded_features, + cache = ( + label_for_missing_given_feature = label_for_missing_given_feature, + encoded_features = encoded_features, ) return cache end @@ -114,8 +115,13 @@ Apply a fitted missingness encoder to a table given the output of `missingness_e - `X_tr`: The table with selected features after the selected features are transformed by the missingness encoder """ -function missingness_encoder_transform(X, cache::Dict) - label_for_missing_given_feature = cache[:label_for_missing_given_feature] - return generic_transform(X, label_for_missing_given_feature; ignore_unknown = true) +function missingness_encoder_transform(X, cache::NamedTuple) + label_for_missing_given_feature = cache.label_for_missing_given_feature + return generic_transform( + X, + label_for_missing_given_feature; + ignore_unknown = true, + ensure_categorical = true, + ) end diff --git a/src/encoders/ordinal_encoding/interface_mlj.jl b/src/encoders/ordinal_encoding/interface_mlj.jl index c6b32cf..146b86c 100644 --- a/src/encoders/ordinal_encoding/interface_mlj.jl +++ b/src/encoders/ordinal_encoding/interface_mlj.jl @@ -1,10 +1,11 @@ ### OrdinalEncoding with MLJ Interface # 1. Interface Struct -mutable struct OrdinalEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised - features::AS +mutable struct OrdinalEncoder{A <: Any} <: Unsupervised + features::A ignore::Bool ordered_factor::Bool + output_type::Type end; # 2. Constructor @@ -12,8 +13,9 @@ function OrdinalEncoder(; features = Symbol[], ignore = true, ordered_factor = false, + output_type = Float32, ) - return OrdinalEncoder(features, ignore, ordered_factor) + return OrdinalEncoder(features, ignore, ordered_factor, output_type) end; @@ -29,10 +31,11 @@ function MMI.fit(transformer::OrdinalEncoder, verbosity::Int, X) transformer.features; ignore = transformer.ignore, ordered_factor = transformer.ordered_factor, + output_type = transformer.output_type, ) fitresult = - generic_cache[:index_given_feat_level] - report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features + generic_cache.index_given_feat_level + report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features cache = nothing return fitresult, cache, report end; @@ -40,9 +43,7 @@ end; # 6. Transform method function MMI.transform(transformer::OrdinalEncoder, fitresult, Xnew) - generic_cache = Dict( - :index_given_feat_level => fitresult, - ) + generic_cache = (index_given_feat_level = fitresult,) Xnew_transf = ordinal_encoder_transform(Xnew, generic_cache) return Xnew_transf end @@ -81,17 +82,16 @@ In MLJ (or MLJBase) bind an instance `model` to data with Here: -- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must - have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to - check scitypes. +$X_doc_mlj Train the machine using `fit!(mach, rows=...)`.
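For example (a schematic sketch only, mirroring the other docstrings; `X` is any table as described above):

```julia
encoder = OrdinalEncoder(ordered_factor = true, output_type = Float64)
mach = fit!(machine(encoder, X))
Xnew = transform(mach, X)
```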
# Hyper-parameters -- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding -- `ignore=true`: Whether to exclude or includes the features given in `features` -- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them +$features_doc +$ignore_doc +$ordered_factor_doc +- `output_type`: The numerical concrete type of the encoded features. Default is `Float32`. # Operations @@ -109,7 +109,7 @@ The fields of `fitted_params(mach)` are: The fields of `report(mach)` are: -- `encoded_features`: The subset of the categorical features of X that were encoded +$encoded_features_doc # Examples diff --git a/src/encoders/ordinal_encoding/ordinal_encoding.jl b/src/encoders/ordinal_encoding/ordinal_encoding.jl index 4afff9d..9d3d765 100644 --- a/src/encoders/ordinal_encoding/ordinal_encoding.jl +++ b/src/encoders/ordinal_encoding/ordinal_encoding.jl @@ -5,39 +5,41 @@ Fit an encoder to encode the levels of categorical variables in a given table as integers (ordered arbitrarily). # Arguments + $X_doc + $features_doc + $ignore_doc + $ordered_factor_doc + - `output_type`: The numerical concrete type of the encoded features. Default is `Float32`. - - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor` - - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding - - `ignore=true`: Whether to exclude or includes the features given in `features` - - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them -# Returns (in a dict) +# Returns (as a named tuple) - `index_given_feat_level`: Maps each level for each column in a subset of the categorical features of X into an integer. - `encoded_features`: The subset of the categorical features of X that were encoded """ function ordinal_encoder_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, + output_type::Type = Float32, ) # 1. Define feature mapper function feature_mapper(col, name) feat_levels = levels(col) index_given_feat_val = - Dict{Any, Integer}(value => index for (index, value) in enumerate(feat_levels)) + Dict{eltype(feat_levels), output_type}( + value => index for (index, value) in enumerate(feat_levels) + ) return index_given_feat_val end # 2. Pass it to generic_fit index_given_feat_level, encoded_features = generic_fit( X, features; ignore = ignore, ordered_factor = ordered_factor, - feature_mapper = feature_mapper, - ) - cache = Dict( - :index_given_feat_level => index_given_feat_level, - :encoded_features => encoded_features, + feature_mapper = feature_mapper) + cache = ( + index_given_feat_level = index_given_feat_level, + encoded_features = encoded_features, ) return cache end @@ -57,7 +59,7 @@ Encode the levels of a categorical variable in a given table as integers. - `X_tr`: The table with selected features after the selected features are encoded by ordinal encoding.
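A minimal round-trip sketch (illustrative; the `size` column is invented, and the integer assigned to each level is arbitrary):

```julia
using MLJTransforms: ordinal_encoder_fit, ordinal_encoder_transform
using CategoricalArrays

X = (size = categorical(["S", "M", "L", "M"]),)
cache = ordinal_encoder_fit(X)
X_tr = ordinal_encoder_transform(X, cache)  # :size mapped to Float32 level indices
```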
""" -function ordinal_encoder_transform(X, cache::Dict) - index_given_feat_level = cache[:index_given_feat_level] +function ordinal_encoder_transform(X, cache::NamedTuple) + index_given_feat_level = cache.index_given_feat_level return generic_transform(X, index_given_feat_level) end diff --git a/src/encoders/target_encoding/interface_mlj.jl b/src/encoders/target_encoding/interface_mlj.jl index b416b90..3b9443b 100644 --- a/src/encoders/target_encoding/interface_mlj.jl +++ b/src/encoders/target_encoding/interface_mlj.jl @@ -1,9 +1,9 @@ ### TargetEncoding with MLJ Interface # 1. Interface Struct -mutable struct TargetEncoder{R1 <: Real, R2 <: Real, AS <: AbstractVector{Symbol}} <: +mutable struct TargetEncoder{R1 <: Real, R2 <: Real, A <: Any} <: Unsupervised - features::AS + features::A ignore::Bool ordered_factor::Bool lambda::R1 @@ -45,12 +45,14 @@ end struct TargetEncoderResult{ I <: Integer, S <: AbstractString, - A <: Any # Useless but likely can't do much better + A <: Any, # Useless but likely can't do much better } <: MMI.MLJType # target statistic for each level of each categorical feature y_stat_given_feat_level::Dict{A, A} task::S # "Regression", "Classification" num_classes::I # num_classes in case of classification + y_classes::A # y_classes in case of classification + end @@ -73,11 +75,12 @@ function MMI.fit(transformer::TargetEncoder, verbosity::Int, X, y) m = transformer.m, ) fitresult = TargetEncoderResult( - generic_cache[:y_stat_given_feat_level], - generic_cache[:task], - generic_cache[:num_classes], + generic_cache.y_stat_given_feat_level, + generic_cache.task, + generic_cache.num_classes, + generic_cache.y_classes, ) - report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features + report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features cache = nothing return fitresult, cache, report end; @@ -85,11 +88,11 @@ end; # 7. Transform method function MMI.transform(transformer::TargetEncoder, fitresult, Xnew) - generic_cache = Dict( - :y_stat_given_feat_level => - fitresult.y_stat_given_feat_level, - :num_classes => fitresult.num_classes, - :task => fitresult.task, + generic_cache = ( + y_stat_given_feat_level = fitresult.y_stat_given_feat_level, + num_classes = fitresult.num_classes, + task = fitresult.task, + y_classes = fitresult.y_classes, ) Xnew_transf = target_encoder_transform(Xnew, generic_cache) return Xnew_transf @@ -128,9 +131,7 @@ In MLJ (or MLJBase) bind an instance `model` to data with Here: -- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must - have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to - check scitypes. +$X_doc_mlj - `y` is the target, which can be any `AbstractVector` whose element scitype is `Continuous` or `Count` for regression problems and @@ -140,9 +141,9 @@ Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters -- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding -- `ignore=true`: Whether to exclude or includes the features given in `features` -- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them +$features_doc +$ignore_doc +$ordered_factor_doc - `λ`: Shrinkage hyperparameter used to mix between posterior and prior statistics as described in [1] - `m`: An integer hyperparameter to compute shrinkage as described in [1]. 
If `m=:auto` then m will be computed using empirical Bayes estimation as described in [1] @@ -165,7 +166,7 @@ The fields of `fitted_params(mach)` are: The fields of `report(mach)` are: -- `encoded_features`: The subset of the categorical features of X that were encoded +$encoded_features_doc # Examples diff --git a/src/encoders/target_encoding/target_encoding.jl b/src/encoders/target_encoding/target_encoding.jl index e7fa859..bb13a42 100644 --- a/src/encoders/target_encoding/target_encoding.jl +++ b/src/encoders/target_encoding/target_encoding.jl @@ -114,12 +114,11 @@ Fit a target encoder on table X with target y by computing the necessary statist # Arguments - - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) - `Multiclass` or `OrderedFactor` + $X_doc - `y`: An abstract vector of labels (e.g., strings) that correspond to the observations in X - - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding - - `ignore=true`: Whether to exclude or includes the features given in `features` - - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them + $features_doc + $ignore_doc + $ordered_factor_doc - `λ`: Shrinkage hyperparameter used to mix between posterior and prior statistics as described in [1] - `m`: An integer hyperparameter to compute shrinkage as described in [1]. If `m=:auto` then m will be computed using empirical Bayes estimation as described in [1] @@ -132,7 +131,7 @@ Fit a target encoder on table X with target y by computing the necessary statist function target_encoder_fit( X, y::AbstractVector, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, lambda::Real = 1.0, @@ -166,8 +165,9 @@ function target_encoder_fit( # 3. Define function to compute the new value(s) for each level given a column function feature_mapper(col, name) + feat_levels = levels(col) y_stat_given_feat_level_for_col = - Dict{Any, Union{AbstractFloat, AbstractVector{<:AbstractFloat}}}() + Dict{eltype(feat_levels), Any}() for level in levels(col) # Get the targets of an example that belong to this level targets_for_level = y[col.==level] @@ -210,11 +210,12 @@ function target_encoder_fit( feature_mapper = feature_mapper, ) - cache = Dict( - :task => task, - :num_classes => (task == "Regression") ? -1 : length(y_classes), - :y_stat_given_feat_level => y_stat_given_feat_level, - :encoded_features => encoded_features, + cache = ( + task = task, + num_classes = (task == "Regression") ? -1 : length(y_classes), + y_stat_given_feat_level = y_stat_given_feat_level, + encoded_features = encoded_features, + y_classes = (task == "Regression") ? 
nothing : y_classes, ) return cache end @@ -240,14 +241,16 @@ every categorical feature as well as other metadata needed for transform """ function target_encoder_transform(X, cache) - task = cache[:task] - y_stat_given_feat_level = cache[:y_stat_given_feat_level] - num_classes = cache[:num_classes] + task = cache.task + y_stat_given_feat_level = cache.y_stat_given_feat_level + num_classes = cache.num_classes + y_classes = cache.y_classes return generic_transform( X, y_stat_given_feat_level; single_feat = task == "Regression" || (task == "Classification" && num_classes < 3), - ) + use_levelnames = true, + custom_levels = y_classes) end diff --git a/src/generic.jl b/src/generic.jl index 3acc398..a283d7d 100644 --- a/src/generic.jl +++ b/src/generic.jl @@ -1,6 +1,5 @@ # generic functions go here; such function can be used throughout multiple methods - """ **Private method.** @@ -13,21 +12,20 @@ logic?" # Arguments - - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) - `Multiclass` or `OrderedFactor` - - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding - - `ignore=true`: Whether to exclude or includes the features given in `features` - - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them - - `feature_mapper`: Defined above. + $X_doc + $features_doc + $ignore_doc + $ordered_factor_doc + - feature_mapper: Defined above. # Returns - - `mapping_per_feat_level`: Maps each level for each feature in a subset of the categorical features of + - mapping_per_feat_level: Maps each level for each feature in a subset of the categorical features of X into a scalar or a vector. - - `encoded_features`: The subset of the categorical features of X that were encoded + $encoded_features_doc """ function generic_fit(X, - features::AbstractVector{Symbol} = Symbol[], + features = Symbol[], args...; ignore::Bool = true, ordered_factor::Bool = false, @@ -38,7 +36,22 @@ function generic_fit(X, feat_names = Tables.schema(X).names #2. Modify column_names based on features - feat_names = (ignore) ? setdiff(feat_names, features) : intersect(feat_names, features) + if features isa Symbol + features = [features] + end + + if features isa AbstractVector{Symbol} + # Original behavior for vector of symbols + feat_names = + (ignore) ? setdiff(feat_names, features) : intersect(feat_names, features) + else + # If features is a callable, apply it to each feature name + if ignore + feat_names = filter(name -> !features(name), feat_names) + else + feat_names = filter(features, feat_names) + end + end # 3. Define mapping per column per level dictionary mapping_per_feat_level = Dict() @@ -49,11 +62,13 @@ function generic_fit(X, feat_col = Tables.getcolumn(X, feat_name) feat_type = elscitype(feat_col) feat_has_allowed_type = - feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor}) + feat_type <: Union{Missing, Multiclass} || + (ordered_factor && feat_type <: Union{Missing, OrderedFactor}) if feat_has_allowed_type # then should be encoded push!(encoded_features, feat_name) # Compute the dict using the given feature_mapper function - mapping_per_feat_level[feat_name] = feature_mapper(feat_col, feat_name, args...; kwargs...) + mapping_per_feat_level[feat_name] = + feature_mapper(feat_col, feat_name, args...; kwargs...) 
end end return mapping_per_feat_level, encoded_features @@ -64,19 +79,37 @@ end """ **Private method.** -Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n +Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n or if possible, +feat_name_level_0, feat_name_level_1,..., feat_name_level_n """ -function generate_new_feat_names(feat_name, num_inds, existing_names) - conflict = true # will be kept true as long as there is a conflict - count = 1 # number of conflicts+1 = number of underscores +function generate_new_feat_names( + feat_name, + num_inds, + levels, + existing_names; + use_levelnames = true, +) + # Convert levels (e.g. KeySet or Tuple) to an indexable vector + levels_vec = collect(levels) + + conflict = true # true while there's a name clash + count = 1 # number of underscores in the suffix + new_column_names = Symbol[] - new_column_names = [] while conflict - suffix = repeat("_", count) - new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds] + suffix = repeat("_", count) + if use_levelnames + # Always use the first num_inds level names + new_column_names = [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ] + else + # Always use numeric indices + new_column_names = [ Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds ] + end + # Check for collisions conflict = any(name -> name in existing_names, new_column_names) count += 1 end + return new_column_names end @@ -85,22 +118,33 @@ end """ **Private method.** -Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in -a subset of categorical features of X into a scalar or a vector (as specified in single_feat) +Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in +a subset of categorical features of X into a scalar or a vector (as specified in `single_feat`) - - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` - into a scalar (single_feat=true) + - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` + into a scalar (`single_feat=true`) - - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` - into a set of k features where k is the length of the vector (single_feat=false) + - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` + into a set of `k` features where `k` is the length of the vector (`single_feat=false`) - In both cases it attempts to preserve the type of the table. - In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such - assumption is necessary because any column in X must correspond to a constant number of features + assumption is necessary because any column in X must correspond to a constant number of features in the output table (which is equal to k). - Features not in the dictionary are mapped to themselves (i.e., not changed). - Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error. + - use_levelnames: if true, the new feature names are generated using the level names when the transform generates multiple features; + else they are generated using the indices of the levels. 
+ - custom_levels: if not `nothing`, then the levels of the categorical features are replaced by the custom_levels """ -function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore_unknown = false) +function generic_transform( + X, + mapping_per_feat_level; + single_feat = true, + ignore_unknown = false, + use_levelnames = false, + custom_levels = nothing, + ensure_categorical = false, +) feat_names = Tables.schema(X).names new_feat_names = Symbol[] new_cols = [] @@ -115,25 +159,34 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore if !issubset(test_levels, train_levels) # get the levels in test that are not in train lost_levels = setdiff(test_levels, train_levels) - error("While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.") + error( + "While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.", + ) end end - + if single_feat level2scalar = mapping_per_feat_level[feat_name] - new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col + if ensure_categorical + new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col + else + new_col = !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col + end + push!(new_cols, new_col) push!(new_feat_names, feat_name) else level2vector = mapping_per_feat_level[feat_name] - new_multi_col = map(x->get(level2vector, x, x), col) + new_multi_col = map(x -> get(level2vector, x, x), col) new_multi_col = [col for col in eachrow(hcat(new_multi_col...))] push!(new_cols, new_multi_col...) feat_names_with_inds = generate_new_feat_names( feat_name, length(first(mapping_per_feat_level[feat_name])[2]), - feat_names, + (custom_levels === nothing) ? keys(mapping_per_feat_level[feat_name]) : custom_levels, + feat_names; + use_levelnames = use_levelnames, ) push!(new_feat_names, feat_names_with_inds...) end @@ -144,8 +197,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore end end - transformed_X= NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...) + transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...) # Attempt to preserve table type transformed_X = Tables.materializer(X)(transformed_X) return transformed_X -end \ No newline at end of file +end diff --git a/src/transformers/cardinality_reducer/cardinality_reducer.jl b/src/transformers/cardinality_reducer/cardinality_reducer.jl index 1d8f531..a73e3aa 100644 --- a/src/transformers/cardinality_reducer/cardinality_reducer.jl +++ b/src/transformers/cardinality_reducer/cardinality_reducer.jl @@ -11,11 +11,10 @@ types that are in `Char`, `AbstractString`, and `Number`. # Arguments - - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) - `Multiclass` or `OrderedFactor` - - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding - - `ignore=true`: Whether to exclude or includes the features given in `features` - - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them + $X_doc + $features_doc + $ignore_doc + $ordered_factor_doc - `min_frequency::Real=3`: Any level of a categorical feature that occurs with frequency < `min_frequency` will be mapped to a new level. Could be an integer or a float which decides whether raw counts or normalized frequencies are used. 
- `label_for_infrequent::Dict{<:Type, <:Any} = Dict( AbstractString => "Other", Char => 'O', )`: A @@ -24,30 +23,31 @@ types that are in `Char`, `AbstractString`, and `Number`. then the new value is `"Other"` and if the raw type subtypes `Char` then the new value is `'O'` and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1. -# Returns (in a dict) +# Returns (as a named tuple) - `new_cat_given_col_val`: A dictionary that maps each level in a categorical feature to a new level (either itself or the new level specified in `label_for_infrequent`) - - `encoded_features`: The subset of the categorical features of X that were encoded + $encoded_features_doc """ function cardinality_reducer_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, - min_frequency::Real = 3, - label_for_infrequent::Dict{<:Type, <:Any} = Dict( + min_frequency::Real = 3, + label_for_infrequent::Dict{<:Type, <:Any} = Dict( AbstractString => "Other", Char => 'O', ), -) - supportedtypes = Union{Char, AbstractString, Number} +) + supportedtypes_list = [Char, AbstractString, Number] + supportedtypes = Union{supportedtypes_list...} # 1. Define feature mapper function feature_mapper(col, name) val_to_freq = (min_frequency isa AbstractFloat) ? proportionmap(col) : countmap(col) - col_type = eltype(col).parameters[1] feat_levels = levels(col) + col_type = eltype(feat_levels) # Ensure column type is valid (can't test because never occurs) # Converting array elements to strings before wrapping in a `CategoricalArray`, as... # Ensure label_for_infrequent keys are valid types for possible_col_type in keys(label_for_infrequent) - if !(possible_col_type in union_types(supportedtypes)) + if !(possible_col_type in supportedtypes_list) throw(ArgumentError(VALID_TYPES_NEW_VAL(possible_col_type))) end end @@ -71,7 +71,7 @@ function cardinality_reducer_fit( # Get ancestor type of column elgrandtype = nothing - for allowed_type in union_types(supportedtypes) + for allowed_type in supportedtypes_list if col_type <: allowed_type elgrandtype = allowed_type break @@ -87,7 +87,11 @@ function cardinality_reducer_fit( elseif elgrandtype == Number new_cat_given_col_val[level] = minimum(feat_levels) - 1 else - throw(ArgumentError(UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent))) + throw( + ArgumentError( + UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent), + ), + ) end end end @@ -97,11 +101,12 @@ # 2.
Pass it to generic_fit new_cat_given_col_val, encoded_features = generic_fit( - X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper, + X, features; ignore = ignore, ordered_factor = ordered_factor, + feature_mapper = feature_mapper, ) - cache = Dict( - :new_cat_given_col_val => new_cat_given_col_val, - :encoded_features => encoded_features, + cache = ( + new_cat_given_col_val = new_cat_given_col_val, + encoded_features = encoded_features, ) return cache end @@ -122,7 +127,12 @@ Apply a fitted cardinality reducer to a table given the output of `cardinality_r - `X_tr`: The table with selected features after the selected features are transformed by the cardinality reducer """ -function cardinality_reducer_transform(X, cache::Dict) - new_cat_given_col_val = cache[:new_cat_given_col_val] - return generic_transform(X, new_cat_given_col_val; ignore_unknown = true) +function cardinality_reducer_transform(X, cache::NamedTuple) + new_cat_given_col_val = cache.new_cat_given_col_val + return generic_transform( + X, + new_cat_given_col_val; + ignore_unknown = true, + ensure_categorical = true, + ) end diff --git a/src/transformers/cardinality_reducer/interface_mlj.jl b/src/transformers/cardinality_reducer/interface_mlj.jl index 31af464..201d268 100644 --- a/src/transformers/cardinality_reducer/interface_mlj.jl +++ b/src/transformers/cardinality_reducer/interface_mlj.jl @@ -2,12 +2,11 @@ # 1. Interface Struct mutable struct CardinalityReducer{ - AS <: AbstractVector{Symbol}, R <: Real, T <: Type, A <: Any, } <: Unsupervised - features::AS + features::A ignore::Bool ordered_factor::Bool min_frequency::R label_for_infrequent::Dict{T, A} @@ -25,7 +24,13 @@ function CardinalityReducer(; Char => 'O', ), ) - return CardinalityReducer(features, ignore, ordered_factor, min_frequency, label_for_infrequent) + return CardinalityReducer( + features, + ignore, + ordered_factor, + min_frequency, + label_for_infrequent, + ) end; @@ -44,9 +49,9 @@ function MMI.fit(transformer::CardinalityReducer, verbosity::Int, X) min_frequency = transformer.min_frequency, label_for_infrequent = transformer.label_for_infrequent, ) - fitresult = generic_cache[:new_cat_given_col_val] + fitresult = generic_cache.new_cat_given_col_val - report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features + report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features cache = nothing return fitresult, cache, report end; @@ -54,9 +59,8 @@ end; # 6. Transform method function MMI.transform(transformer::CardinalityReducer, fitresult, Xnew) - generic_cache = Dict( - :new_cat_given_col_val => - fitresult, + generic_cache = ( + new_cat_given_col_val = fitresult, ) Xnew_transf = cardinality_reducer_transform(Xnew, generic_cache) return Xnew_transf @@ -98,17 +102,15 @@ In MLJ (or MLJBase) bind an instance `model` to data with Here: -- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must - have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to - check scitypes. +$X_doc_mlj Train the machine using `fit!(mach, rows=...)`.
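A hedged usage sketch (the table `X` and the threshold are invented; rare string levels collapse to `"Other"` by default):

```julia
reducer = CardinalityReducer(min_frequency = 2)
mach = fit!(machine(reducer, X))
Xnew = transform(mach, X)  # levels occurring fewer than 2 times are merged
```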
# Hyper-parameters -- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding -- `ignore=true`: Whether to exclude or includes the features given in `features` -- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them +$features_doc +$ignore_doc +$ordered_factor_doc - `min_frequency::Real=3`: Any level of a categorical feature that occurs with frequency < `min_frequency` will be mapped to a new level. Could be an integer or a float which decides whether raw counts or normalized frequencies are used. - `label_for_infrequent::Dict{<:Type, <:Any} = Dict( AbstractString => "Other", Char => 'O', )`: A @@ -134,7 +136,7 @@ The fields of `fitted_params(mach)` are: The fields of `report(mach)` are: -- `encoded_features`: The subset of the categorical features of X that were encoded +$encoded_features_doc # Examples diff --git a/src/utils.jl b/src/utils.jl index de21a05..8ac976c 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,3 +1 @@ -# To go from e.g., Union{Integer, String} to (Integer, String) -union_types(x::Union) = (x.a, union_types(x.b)...) -union_types(x::Type) = (x,) \ No newline at end of file +# add utility functions here \ No newline at end of file diff --git a/test/encoders/contrast_encoder.jl b/test/encoders/contrast_encoder.jl index fa110c7..0e252bd 100644 --- a/test/encoders/contrast_encoder.jl +++ b/test/encoders/contrast_encoder.jl @@ -1,19 +1,20 @@ -using MLJTransforms: contrast_encoder_fit, contrast_encoder_transform, get_dummy_contrast, get_sum_contrast, -create_backward_vector, get_backward_diff_contrast, get_forward_diff_contrast, create_helmert_vector, get_helmert_contrast, ContrastEncoder +using MLJTransforms: contrast_encoder_fit, contrast_encoder_transform, get_dummy_contrast, + get_sum_contrast, + create_backward_vector, get_backward_diff_contrast, get_forward_diff_contrast, + create_helmert_vector, get_helmert_contrast, ContrastEncoder stable_rng = StableRNGs.StableRNG(123) -X = (name = categorical(["Ben", "John", "Mary", "John"]), -height = [1.85, 1.67, 1.5, 1.67], -favnum = categorical([7, 5, 10, 1]), -age = [23, 23, 14, 23]) +X = (name = categorical(["Ben", "John", "Mary", "John"]), + height = [1.85, 1.67, 1.5, 1.67], + favnum = categorical([7, 5, 10, 1]), + age = [23, 23, 14, 23]) @testset "Contrast Encoder Error Handling" begin - # Example definitions to allow the test to run function dummy_buildmatrix(colname, k) # Simple dummy function to generate a matrix of correct size - return randn(k, k-1) # Adjust dimensions as needed for the test + return randn(k, k - 1) # Adjust dimensions as needed for the test end # Define a DataFrame or appropriate data structure to test with data = DataFrame( A = categorical(["X", "Y", "Z", "X", "Y"]), B = [1, 2, 3, 4, 5], ) # Test IGNORE_MUST_FALSE_VEC_MODE error - @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=[:contrast], ignore=true) + @test_throws MLJTransforms.IGNORE_MUST_FALSE_VEC_MODE begin + contrast_encoder_fit(data, [:A], mode = [:contrast], ignore = true) + end # Test LENGTH_MISMATCH_VEC_MODE error - @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=[:contrast, :dummy], buildmatrix=dummy_buildmatrix, ignore=false) + @test_throws MLJTransforms.LENGTH_MISMATCH_VEC_MODE(2, 1) begin + contrast_encoder_fit( + data, + [:A], + mode = [:contrast, :dummy], + buildmatrix = dummy_buildmatrix, + ignore = false, + ) + end # Test BUILDFUNC_MUST_BE_SPECIFIED error - @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=:contrast, ignore=false) + @test_throws
MLJTransforms.BUILDFUNC_MUST_BE_SPECIFIED begin + contrast_encoder_fit(data, [:A], mode = :contrast, ignore = false) + end # Test MATRIX_SIZE_ERROR wrong_buildmatrix = (levels, k) -> randn(k, k) # Incorrect dimensions - @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=:contrast, buildmatrix=wrong_buildmatrix, ignore=false) + k = 3 # Number of levels in data[:A] + wrong_size = (k, k) + @test_throws MLJTransforms.MATRIX_SIZE_ERROR(k, wrong_size, :A) begin + contrast_encoder_fit( + data, + [:A], + mode = :contrast, + buildmatrix = wrong_buildmatrix, + ignore = false, + ) + end # Test MATRIX_SIZE_ERROR_HYP - wrong_buildmatrix_hyp = (levels, k) -> randn(k, k+1) # Incorrect dimensions for hypothesis matrix - @test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=:hypothesis, buildmatrix=wrong_buildmatrix_hyp, ignore=false) + wrong_buildmatrix_hyp = (levels, k) -> randn(k, k + 1) # Incorrect dimensions for hypothesis matrix + wrong_size_hyp = (k, k + 1) + @test_throws MLJTransforms.MATRIX_SIZE_ERROR_HYP(k, wrong_size_hyp, :A) begin + contrast_encoder_fit( + data, + [:A], + mode = :hypothesis, + buildmatrix = wrong_buildmatrix_hyp, + ignore = false, + ) + end + end @testset "Dummy Coding Tests" begin for k in 2:5 # Testing for various numbers of levels contrast_matrix = get_dummy_contrast(k) - expected_matrix = Matrix(1.0I, k, k-1) + expected_matrix = Matrix(1.0I, k, k - 1) @test contrast_matrix == expected_matrix - @test size(contrast_matrix) == (k, k-1) + @test size(contrast_matrix) == (k, k - 1) end # test that fit is correct for dummy Coding - cache = contrast_encoder_fit(X, [:name]; ignore=false, mode = :dummy) + cache = contrast_encoder_fit(X, [:name]; ignore = false, mode = :dummy) k = length(levels(X.name)) contrast_matrix = get_dummy_contrast(k) - print() for (i, level) in enumerate(levels(X.name)) - println(cache[:vector_given_value_given_feature]) - @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :] + @test cache.vector_given_value_given_feature[:name][level] == contrast_matrix[i, :] end end @testset "Sum Coding Tests" begin # Manually define the expected matrix for a 4-level categorical variable - expected_matrix_4 = [1.0 0.0 0.0; - 0.0 1.0 0.0; - 0.0 0.0 1.0; - -1.0 -1.0 -1.0] # Sum of each column for the first three rows is zeroed by the last row + expected_matrix_4 = [ 1.0 0.0 0.0; + 0.0 1.0 0.0; + 0.0 0.0 1.0; + -1.0 -1.0 -1.0] # Sum of each column for the first three rows is zeroed by the last row contrast_matrix_4 = get_sum_contrast(4) @test contrast_matrix_4 == expected_matrix_4 @test size(contrast_matrix_4) == (4, 3) # Additional tests can be included for different levels, with each matrix defined manually # Example for 3 levels - expected_matrix_3 = [1.0 0.0; - 0.0 1.0; - -1.0 -1.0] + expected_matrix_3 = [ 1.0 0.0; + 0.0 1.0; + -1.0 -1.0] contrast_matrix_3 = get_sum_contrast(3) @test contrast_matrix_3 == expected_matrix_3 @test size(contrast_matrix_3) == (3, 2) # test that fit is correct for sum Coding - cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :sum) + cache = contrast_encoder_fit(X, [:name, :favnum]; ignore = false, mode = :sum) k = length(levels(X.favnum)) contrast_matrix = get_sum_contrast(k) for (i, level) in enumerate(levels(X.favnum)) - @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :] + @test cache.vector_given_value_given_feature[:favnum][level] == + contrast_matrix[i, :] end end @testset "Backward Difference Coding Tests" begin # Manually 
define the expected matrix for a 4 level categorical variable expected_matrix_4 = [-0.75 -0.5 -0.25; - 0.25 -0.5 -0.25; - 0.25 0.5 -0.25; - 0.25 0.5 0.75] + 0.25 -0.5 -0.25; + 0.25 0.5 -0.25; + 0.25 0.5 0.75] contrast_matrix_4 = get_backward_diff_contrast(4) @test contrast_matrix_4 == expected_matrix_4 @test size(contrast_matrix_4) == (4, 3) # Test that fit is correct for backward Coding - cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :backward_diff) + cache = contrast_encoder_fit(X, [:name, :favnum]; ignore = false, mode = :backward_diff) k = length(levels(X.favnum)) contrast_matrix = get_backward_diff_contrast(k) for (i, level) in enumerate(levels(X.favnum)) - @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :] + @test cache.vector_given_value_given_feature[:favnum][level] == + contrast_matrix[i, :] end end @@ -110,15 +143,16 @@ end backward_matrix = get_backward_diff_contrast(k) forward_matrix = get_forward_diff_contrast(k) @test forward_matrix == -backward_matrix - @test size(forward_matrix) == (k, k-1) + @test size(forward_matrix) == (k, k - 1) end # Test that fit is correct for forward Coding - cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :forward_diff) + cache = contrast_encoder_fit(X, [:name, :favnum]; ignore = false, mode = :forward_diff) k = length(levels(X.favnum)) contrast_matrix = get_forward_diff_contrast(k) for (i, level) in enumerate(levels(X.favnum)) - @test cache[:vector_given_value_given_feature][:favnum][level] == contrast_matrix[i, :] + @test cache.vector_given_value_given_feature[:favnum][level] == + contrast_matrix[i, :] end end @@ -130,18 +164,18 @@ end @test create_helmert_vector(1, 3) == [-1.0, 1.0, 0.0] @test create_helmert_vector(2, 3) == [-1.0, -1.0, 2.0] k = 4 - @test get_helmert_contrast(k) == [ - -1.0 -1.0 -1.0 - 1.0 -1.0 -1.0 - 0.0 2.0 -1.0 - 0.0 0.0 3.0] - # test that fit is correct for helmert Coding - cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode = :helmert) - k = length(levels(X.name)) - contrast_matrix = get_helmert_contrast(k) - for (i, level) in enumerate(levels(X.name)) - @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :] - end + @test get_helmert_contrast(k) == [ + -1.0 -1.0 -1.0 + 1.0 -1.0 -1.0 + 0.0 2.0 -1.0 + 0.0 0.0 3.0] + # test that fit is correct for helmert Coding + cache = contrast_encoder_fit(X, [:name, :favnum]; ignore = false, mode = :helmert) + k = length(levels(X.name)) + contrast_matrix = get_helmert_contrast(k) + for (i, level) in enumerate(levels(X.name)) + @test cache.vector_given_value_given_feature[:name][level] == contrast_matrix[i, :] + end end @@ -150,23 +184,27 @@ end function buildrandomcontrast(colname, k) - return rand(StableRNGs.StableRNG(123), k, k-1) + return rand(StableRNGs.StableRNG(123), k, k - 1) end - cache = contrast_encoder_fit(X; mode=:contrast, buildmatrix=buildrandomcontrast) + cache = contrast_encoder_fit(X; mode = :contrast, buildmatrix = buildrandomcontrast) X_tr = contrast_encoder_transform(X, cache) - X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] + X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1] df = DataFrame(X) - mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( - :name => StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 3)), - :favnum=> StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 4)) - )) + mf = ModelFrame( + @formula(age ~ (name + height + favnum)), + df, + contrasts = Dict( + :name => 
StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 3)), + :favnum => StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 4)), + ), + ) - X_tr_sm = ModelMatrix(mf).m[:, 2:end] + X_tr_sm = ModelMatrix(mf).m[:, 2:end] @test X_tr_mlj == X_tr_sm end @@ -175,31 +213,43 @@ end @testset "hypothesis mode end-to-end test" begin function buildrandomhypothesis(colname, k) - return rand(StableRNGs.StableRNG(123), k-1, k) - end + return rand(StableRNGs.StableRNG(123), k - 1, k) + end - cache = contrast_encoder_fit(X; mode=:hypothesis, buildmatrix=buildrandomhypothesis) + cache = contrast_encoder_fit(X; mode = :hypothesis, buildmatrix = buildrandomhypothesis) X_tr = contrast_encoder_transform(X, cache) - X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] + X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1] df = DataFrame(X) - mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( - :name => HypothesisCoding(buildrandomhypothesis(nothing, 3); levels=levels(X.name), labels=[]), - :favnum=> HypothesisCoding(buildrandomhypothesis(nothing, 4); levels=levels(X.favnum), labels=[]) - )) + mf = ModelFrame( + @formula(age ~ (name + height + favnum)), + df, + contrasts = Dict( + :name => HypothesisCoding( + buildrandomhypothesis(nothing, 3); + levels = levels(X.name), + labels = [], + ), + :favnum => HypothesisCoding( + buildrandomhypothesis(nothing, 4); + levels = levels(X.favnum), + labels = [], + ), + ), + ) - X_tr_sm = ModelMatrix(mf).m[:, 2:end] + X_tr_sm = ModelMatrix(mf).m[:, 2:end] @test X_tr_mlj == X_tr_sm end function buildrandomhypothesis(colname, k) - return rand(StableRNGs.StableRNG(123), k-1, k) -end + return rand(StableRNGs.StableRNG(123), k - 1, k) +end function buildrandomcontrast(colname, k) - return rand(StableRNGs.StableRNG(123), k, k-1) + return rand(StableRNGs.StableRNG(123), k, k - 1) end @testset "single-mode end-to-end test with StatsModels" begin @@ -207,29 +257,38 @@ end for ind in 1:6 stats_models(k, ind) = [ StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)), - DummyCoding(; base=(k == 3) ? "Mary" : 10), - EffectsCoding(; base=(k == 3) ? "Mary" : 10), + DummyCoding(; base = (k == 3) ? "Mary" : 10), + EffectsCoding(; base = (k == 3) ? "Mary" : 10), SeqDiffCoding(), HelmertCoding(), - HypothesisCoding(buildrandomhypothesis(nothing, k); levels=(k == 3) ? levels(X.name) : levels(X.favnum), labels=[]), + HypothesisCoding( + buildrandomhypothesis(nothing, k); + levels = (k == 3) ? 
levels(X.name) : levels(X.favnum), + labels = [], + ), ][ind] modes = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis] - matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis] + matrix_func = + [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis] # Try MLJTransforms - cache = contrast_encoder_fit(X; mode=modes[ind], buildmatrix=matrix_func[ind]) + cache = contrast_encoder_fit(X; mode = modes[ind], buildmatrix = matrix_func[ind]) X_tr = contrast_encoder_transform(X, cache) df = DataFrame(X) - mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( - :name => stats_models(3, ind), - :favnum=> stats_models(4, ind), - )) - - X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] - X_tr_sm = ModelMatrix(mf).m[:, 2:end] - @test X_tr_mlj ≈ X_tr_sm + mf = ModelFrame( + @formula(age ~ (name + height + favnum)), + df, + contrasts = Dict( + :name => stats_models(3, ind), + :favnum => stats_models(4, ind), + ), + ) + + X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1] + X_tr_sm = ModelMatrix(mf).m[:, 2:end] + @test X_tr_mlj ≈ X_tr_sm end end @@ -239,31 +298,52 @@ end for ind2 in 2:5 stats_models(k, ind) = [ StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)), - DummyCoding(; base=(k == 3) ? "Mary" : 10), - EffectsCoding(; base=(k == 3) ? "Mary" : 10), + DummyCoding(; base = (k == 3) ? "Mary" : 10), + EffectsCoding(; base = (k == 3) ? "Mary" : 10), SeqDiffCoding(), HelmertCoding(), - HypothesisCoding(buildrandomhypothesis(nothing, k); levels=(k == 3) ? levels(X.name) : levels(X.favnum), labels=[]), + HypothesisCoding( + buildrandomhypothesis(nothing, k); + levels = (k == 3) ? levels(X.name) : levels(X.favnum), + labels = [], + ), ][ind] modes = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis] - matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis] + matrix_func = [ + buildrandomcontrast, + nothing, + nothing, + nothing, + nothing, + buildrandomhypothesis, + ] # Try MLJTransforms - cache = contrast_encoder_fit(X, [:name, :favnum]; ignore=false, mode=[modes[ind1], modes[ind2]], buildmatrix=matrix_func[ind1]) + cache = contrast_encoder_fit( + X, + [:name, :favnum]; + ignore = false, + mode = [modes[ind1], modes[ind2]], + buildmatrix = matrix_func[ind1], + ) X_tr = contrast_encoder_transform(X, cache) df = DataFrame(X) - mf = ModelFrame(@formula(age ~ (name + height + favnum)), df, contrasts = Dict( - :name => stats_models(3, ind1), - :favnum=> stats_models(4, ind2), - )) + mf = ModelFrame( + @formula(age ~ (name + height + favnum)), + df, + contrasts = Dict( + :name => stats_models(3, ind1), + :favnum => stats_models(4, ind2), + ), + ) - X_tr_mlj = Tables.matrix(X_tr)[:,1:end-1] - X_tr_sm = ModelMatrix(mf).m[:, 2:end] + X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1] + X_tr_sm = ModelMatrix(mf).m[:, 2:end] - @test X_tr_mlj ≈ X_tr_sm + @test X_tr_mlj ≈ X_tr_sm end end end @@ -285,8 +365,45 @@ end # fitted parameters is correct vector_given_value_given_feature = fitted_params(mach).vector_given_value_given_feature - @test vector_given_value_given_feature == generic_cache[:vector_given_value_given_feature] + @test vector_given_value_given_feature == generic_cache.vector_given_value_given_feature # Test report - @test report(mach) == (encoded_features = generic_cache[:encoded_features],) + @test report(mach) == (encoded_features = generic_cache.encoded_features,) +end + + +@testset "Test Contrast Encoder Output Types" begin + X = ( + name = categorical(["Ben", 
"John", "Mary", "John"]), + height = [1.85, 1.67, 1.5, 1.67], + favnum = categorical([7, 5, 10, 1]), + age = [23, 23, 14, 23], + ) + + methods = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis] + matrix_func = + [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis] + + for (i, method) in enumerate(methods) + encoder = ContrastEncoder( + features = [:name, :favnum], + ignore = false, + mode = method, + buildmatrix = matrix_func[i], + ) + mach = fit!(machine(encoder, X)) + Xnew = MMI.transform(mach, X) + + # Test Consistency with Types + scs = schema(Xnew).scitypes + ts = schema(Xnew).types + + # Check scitypes for previously continuos or categorical features + @test all(scs[1:end-1] .== Continuous) + @test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1]) + # Check scitypes for previously Count feature + last_type, last_sctype = ts[end], scs[end] + @test last_type <: Integer && isconcretetype(last_type) + @test last_sctype <: Count + end end \ No newline at end of file diff --git a/test/encoders/frequency_encoder.jl b/test/encoders/frequency_encoder.jl index 555b9f1..e08eefb 100644 --- a/test/encoders/frequency_encoder.jl +++ b/test/encoders/frequency_encoder.jl @@ -9,7 +9,8 @@ using MLJTransforms: frequency_encoder_fit, frequency_encoder_transform for norm in normalize result = frequency_encoder_fit(X; normalize = norm)[:statistic_given_feat_val] enc = - (col, level) -> ((norm) ? sum(col .== level) / length(col) : sum(col .== level)) + (col, level) -> + Float32((norm) ? sum(col .== level) / length(col) : sum(col .== level)) true_output = Dict{Symbol, Dict{Any, Any}}( :F => Dict( "m" => enc(F_col, "m"), @@ -44,7 +45,9 @@ end X_tr = frequency_encoder_transform(X, cache) enc = (col, level) -> - ((norm) ? sum(X[col] .== level) / length(X[col]) : sum(X[col] .== level)) + Float32( + (norm) ? 
sum(X[col] .== level) / length(X[col]) : sum(X[col] .== level), + ) target = ( A = [enc(:A, X[:A][i]) for i in 1:10], @@ -76,9 +79,47 @@ end # fitted parameters is correct statistic_given_feat_val = fitted_params(mach).statistic_given_feat_val - @test statistic_given_feat_val == generic_cache[:statistic_given_feat_val] + @test statistic_given_feat_val == generic_cache.statistic_given_feat_val # Test report - @test report(mach) == (encoded_features = generic_cache[:encoded_features],) + @test report(mach) == (encoded_features = generic_cache.encoded_features,) end -end \ No newline at end of file +end + +@testset "Test Frequency Encoding Output Types" begin + # Define categorical features + A = ["g", "b", "g", "r", "r"] + B = [1.0, 2.0, 3.0, 4.0, 5.0] + C = ["f", "f", "f", "m", "f"] + D = [true, false, true, false, true] + E = [1, 2, 3, 4, 5] + + # Combine into a named tuple + X = (A = A, B = B, C = C, D = D, E = E) + + # Coerce A, C, D to multiclass and B to continuous and E to ordinal + X = coerce(X, + :A => Multiclass, + :B => Continuous, + :C => Multiclass, + :D => Multiclass, + :E => OrderedFactor, + ) + + # Check scitype coercions: + schema(X) + + encoder = FrequencyEncoder(ordered_factor = false, normalize = false) + mach = fit!(machine(encoder, X)) + Xnew = MMI.transform(mach, X) + + + scs = schema(Xnew).scitypes + ts = schema(Xnew).types + # Check scitypes correctness + @test all(scs[1:end-1] .== Continuous) + @test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1]) + # Ordinal column should be intact + @test scs[end] === schema(X).scitypes[end] + @test ts[end] == schema(X).types[end] +end diff --git a/test/encoders/missingness_encoding.jl b/test/encoders/missingness_encoding.jl index 4bcc306..ed9cf43 100644 --- a/test/encoders/missingness_encoding.jl +++ b/test/encoders/missingness_encoding.jl @@ -1,21 +1,26 @@ using MLJTransforms: missingness_encoder_fit, missingness_encoder_transform -@testset "Throws errors when needed" begin - @test_throws ArgumentError begin - X = generate_X_with_missingness(;john_name="missing") +@testset "Missingness Encoder Error Handling" begin + # Test COLLISION_NEW_VAL_ME error - when label_for_missing value already exists in levels + @test_throws MLJTransforms.COLLISION_NEW_VAL_ME("missing") begin + X = generate_X_with_missingness(; john_name = "missing") cache = missingness_encoder_fit( X; label_for_missing = Dict(AbstractString => "missing", Char => 'm'), ) end - @test_throws ArgumentError begin + + # Test VALID_TYPES_NEW_VAL_ME error - when label_for_missing key is not a supported type + @test_throws MLJTransforms.VALID_TYPES_NEW_VAL_ME(Bool) begin X = generate_X_with_missingness() cache = missingness_encoder_fit( X; label_for_missing = Dict(AbstractString => "Other", Bool => 'X'), ) end - @test_throws ArgumentError begin + + # Test UNSPECIFIED_COL_TYPE_ME error - when column type isn't in label_for_missing + @test_throws MLJTransforms.UNSPECIFIED_COL_TYPE_ME(Char, Dict(AbstractString => "X")) begin X = generate_X_with_missingness() cache = missingness_encoder_fit( X; @@ -28,15 +33,22 @@ end @testset "Default for Numbers Set Correctly" begin X = generate_X_with_missingness() cache = missingness_encoder_fit(X) - label_for_missing_given_feature = cache[:label_for_missing_given_feature] + label_for_missing_given_feature = cache.label_for_missing_given_feature @test label_for_missing_given_feature[:C][missing] == minimum(levels(X.C)) - 1 end @testset "End-to-end test" begin X = generate_X_with_missingness() - - cache = 
missingness_encoder_fit(X; label_for_missing = Dict(AbstractString => "missing-item", Char => 'i', Number => -99)) + + cache = missingness_encoder_fit( + X; + label_for_missing = Dict( + AbstractString => "missing-item", + Char => 'i', + Number => -99, + ), + ) X_tr = missingness_encoder_transform(X, cache) for col in [:A, :B, :C, :D, :E] @@ -49,14 +61,18 @@ end @test levels(X[:B]) == levels(X_tr[:B]) @test levels(X[:D]) == levels(X_tr[:D]) end - + @testset "Missingness Encoder Fit" begin X = generate_X_with_missingness() result = missingness_encoder_fit( X; - label_for_missing = Dict(AbstractString => "MissingOne", Char => 'X', Number => -90), + label_for_missing = Dict( + AbstractString => "MissingOne", + Char => 'X', + Number => -90, + ), )[:label_for_missing_given_feature] true_output = Dict{Symbol, Dict{Any, Any}}( @@ -72,12 +88,16 @@ end X = generate_X_with_missingness() cache = missingness_encoder_fit( X; - label_for_missing = Dict(AbstractString => "MissingOne", Char => 'X', Number => -90), + label_for_missing = Dict( + AbstractString => "MissingOne", + Char => 'X', + Number => -90, + ), ) - enc_char = (col, level) -> ismissing(level) ? 'X' : level - enc_num = (col, level) -> ismissing(level) ? -90 : level - enc_str = (col, level) -> ismissing(level) ? "MissingOne" : level + enc_char = (col, level) -> ismissing(level) ? 'X' : level + enc_num = (col, level) -> ismissing(level) ? -90 : level + enc_str = (col, level) -> ismissing(level) ? "MissingOne" : level enc_idn = (col, level) -> level X_tr = missingness_encoder_transform(X, cache) @@ -97,7 +117,7 @@ end ], E = [ enc_char(X[:E], X[:E][i]) for i in 1:7 - ] + ], ) @test isequal(target, X_tr) @@ -105,10 +125,14 @@ end @testset "Schema doesn't change after transform" begin X = generate_X_with_missingness() - + cache = missingness_encoder_fit( X; - label_for_missing = Dict(AbstractString => "MissingOne", Char => 'X', Number => -90), + label_for_missing = Dict( + AbstractString => "MissingOne", + Char => 'X', + Number => -90, + ), ) X_tr = missingness_encoder_transform(X, cache) @@ -126,7 +150,11 @@ end cache = missingness_encoder_fit( X; - label_for_missing = Dict(AbstractString => "MissingOne", Char => 'X', Number => -90), + label_for_missing = Dict( + AbstractString => "MissingOne", + Char => 'X', + Number => -90, + ), ) X_tr = missingness_encoder_transform(X, cache) @@ -149,8 +177,39 @@ end # fitted parameters is correct label_for_missing_given_feature = fitted_params(mach).label_for_missing_given_feature - @test label_for_missing_given_feature == generic_cache[:label_for_missing_given_feature] + @test label_for_missing_given_feature == generic_cache.label_for_missing_given_feature # Test report - @test report(mach) == (encoded_features = generic_cache[:encoded_features],) -end \ No newline at end of file + @test report(mach) == (encoded_features = generic_cache.encoded_features,) +end + + + +@testset "Test Missingness Encoder Output Types" begin + # Define a table with missing values + Xm = ( + A = categorical(["Ben", "John", missing, missing, "Mary", "John", missing]), + B = [1.85, 1.67, missing, missing, 1.5, 1.67, missing], + C = categorical([7, 5, missing, missing, 10, 0, missing]), + D = categorical([23, 23, 44, 66, 14, 23, missing], ordered = true), + E = categorical([missing, 'g', 'r', missing, 'r', 'g', 'p']), + ) + + encoder = MissingnessEncoder() + mach = fit!(machine(encoder, Xm)) + Xnew = MMI.transform(mach, Xm) + + schema(Xm) + schema(Xnew) + Xnew.B + + scs = schema(Xnew).scitypes + for (i, type) in 
enumerate(schema(Xm).scitypes)
+        if nonmissingtype(type) <: Multiclass
+            @test scs[i] <: Multiclass
+        else
+            @test scs[i] == type
+        end
+    end
+end
diff --git a/test/encoders/ordinal_encoding.jl b/test/encoders/ordinal_encoding.jl
index a631830..4af6541 100644
--- a/test/encoders/ordinal_encoding.jl
+++ b/test/encoders/ordinal_encoding.jl
@@ -77,9 +77,60 @@ end
 
     # fitted parameters is correct
     index_given_feat_level = fitted_params(mach).index_given_feat_level
-    @test index_given_feat_level == generic_cache[:index_given_feat_level]
+    @test index_given_feat_level == generic_cache.index_given_feat_level
 
     # Test report
-    @test report(mach) == (encoded_features = generic_cache[:encoded_features],)
+    @test report(mach) == (encoded_features = generic_cache.encoded_features,)
 end
-end
\ No newline at end of file
+end
+
+
+@testset "Test Ordinal Encoding Types" begin
+    # Define categorical features
+    A = ["g", "b", "g", "r", "r"]
+    B = [1.0, 2.0, 3.0, 4.0, 5.0]
+    C = ["f", "f", "f", "m", "f"]
+    D = [true, false, true, false, true]
+    E = [1, 2, 3, 4, 5]
+
+    # Combine into a named tuple
+    X = (A = A, B = B, C = C, D = D, E = E)
+
+    # Coerce A, B, C to multiclass, D to continuous and E to ordinal
+    X = coerce(X,
+        :A => Multiclass,
+        :B => Multiclass,
+        :C => Multiclass,
+        :D => Continuous,
+        :E => OrderedFactor,
+    )
+
+
+    encoder = OrdinalEncoder(ordered_factor = false)
+    mach = fit!(machine(encoder, X))
+    Xnew = MMI.transform(mach, X)
+
+    scs = schema(Xnew).scitypes
+    ts = schema(Xnew).types
+    # Check scitypes for previously continuous or categorical features
+    @test all(scs[1:end-1] .== Continuous)
+    @test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1])
+    # Check that the last column did not change
+    @test scs[end] === schema(X).scitypes[end]
+
+    ## Int32 case
+    encoder = OrdinalEncoder(ordered_factor = false, output_type = Int32)
+    mach = fit!(machine(encoder, X))
+    Xnew = MMI.transform(mach, X)
+    scs = schema(Xnew).scitypes
+    ts = schema(Xnew).types
+    # Check scitypes for previously categorical features
+    @test all(scs[1:end-2] .== Count)
+    @test all(t -> (t <: Integer) && isconcretetype(t), ts[1:end-2])
+    # Check rest of the types
+    @test scs[end-1:end] == schema(X).scitypes[end-1:end]
+    @test ts[end-1:end] == schema(X).types[end-1:end]
+end
diff --git a/test/encoders/target_encoding.jl b/test/encoders/target_encoding.jl
index 4e6b0be..83d167d 100644
--- a/test/encoders/target_encoding.jl
+++ b/test/encoders/target_encoding.jl
@@ -227,7 +227,7 @@ end
         target_encoder_fit(X, y; ignore = true, ordered_factor = false)
     X_tr = target_encoder_transform(X, cache)
 
-    enc = (col, level) -> cache[:y_stat_given_feat_level][col][level]
+    enc = (col, level) -> cache.y_stat_given_feat_level[col][level]
 
     target = (
         A = [enc(:A, X[:A][i]) for i in 1:10],
@@ -257,7 +257,7 @@ end
         target_encoder_fit(X, y)
     X_tr = target_encoder_transform(X, cache)
 
-    enc = (col, level) -> cache[:y_stat_given_feat_level][col][level]
+    enc = (col, level) -> cache.y_stat_given_feat_level[col][level]
 
     target = (
         A = [enc(:A, X[:A][i]) for i in 1:10],
@@ -276,23 +276,22 @@ end
         target_encoder_fit(X, y)
     X_tr = target_encoder_transform(X, cache)
 
-    enc = (col, level) -> cache[:y_stat_given_feat_level][col][level]
-
+    enc = (col, level) -> cache.y_stat_given_feat_level[col][level]
     target = (
-        A_1 = [enc(:A, X[:A][i])[1] for i in 1:10],
-        A_2 = [enc(:A, X[:A][i])[2] for i in 1:10],
-        A_3 = [enc(:A, X[:A][i])[3] for i in 1:10],
+        A_0 = [enc(:A, X[:A][i])[1] 
for i in 1:10], + A_1 = [enc(:A, X[:A][i])[2] for i in 1:10], + A_2 = [enc(:A, X[:A][i])[3] for i in 1:10], B = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], - C_1 = [enc(:C, X[:C][i])[1] for i in 1:10], - C_2 = [enc(:C, X[:C][i])[2] for i in 1:10], - C_3 = [enc(:C, X[:C][i])[3] for i in 1:10], - D_1 = [enc(:D, X[:D][i])[1] for i in 1:10], - D_2 = [enc(:D, X[:D][i])[2] for i in 1:10], - D_3 = [enc(:D, X[:D][i])[3] for i in 1:10], + C_0 = [enc(:C, X[:C][i])[1] for i in 1:10], + C_1 = [enc(:C, X[:C][i])[2] for i in 1:10], + C_2 = [enc(:C, X[:C][i])[3] for i in 1:10], + D_0 = [enc(:D, X[:D][i])[1] for i in 1:10], + D_1 = [enc(:D, X[:D][i])[2] for i in 1:10], + D_2 = [enc(:D, X[:D][i])[3] for i in 1:10], E = [1, 2, 3, 4, 5, 6, 6, 3, 2, 1], - F_1 = [enc(:F, X[:F][i])[1] for i in 1:10], - F_2 = [enc(:F, X[:F][i])[2] for i in 1:10], - F_3 = [enc(:F, X[:F][i])[3] for i in 1:10], + F_0 = [enc(:F, X[:F][i])[1] for i in 1:10], + F_1 = [enc(:F, X[:F][i])[2] for i in 1:10], + F_2 = [enc(:F, X[:F][i])[3] for i in 1:10], ) for col in keys(target) @test all(X_tr[col] .== target[col]) @@ -329,20 +328,72 @@ end # fitted parameters is correct fitresult = fitted_params(mach) @test fitresult.y_statistic_given_feat_level == - generic_cache[:y_stat_given_feat_level] - @test fitresult.task == generic_cache[:task] + generic_cache.y_stat_given_feat_level + @test fitresult.task == generic_cache.task # Test invalid `m` - @test_throws ArgumentError begin - t = TargetEncoder(ignore = true, ordered_factor = false, lambda = 0.5, m = -5) + invalid_m = -5 + @test_throws MLJTransforms.NON_NEGATIVE_m(invalid_m) begin + t = TargetEncoder( + ignore = true, + ordered_factor = false, + lambda = 0.5, + m = invalid_m, + ) end - # Test invalid `lambda` - @test_throws ArgumentError begin - t = TargetEncoder(ignore = true, ordered_factor = false, lambda = 1.1, m = 1) + # Test invalid `lambda` (value > 1) + invalid_lambda = 1.1 + @test_throws MLJTransforms.INVALID_lambda(invalid_lambda) begin + t = TargetEncoder( + ignore = true, + ordered_factor = false, + lambda = invalid_lambda, + m = 1, + ) end # Test report - @test report(mach) == (encoded_features = generic_cache[:encoded_features],) + @test report(mach) == (encoded_features = generic_cache.encoded_features,) end -end \ No newline at end of file +end + + + +@testset "Test Target Encoding Types" begin + # Define categorical features + A = ["g", "b", "g", "r", "r"] + B = [1.0, 2.0, 3.0, 4.0, 5.0] + C = ["f", "f", "f", "m", "f"] + D = [true, false, true, false, true] + E = [1, 2, 3, 4, 5] + + # Define the target variable + y = ["c1", "c2", "c3", "c1", "c2"] + + # Combine into a named tuple + X = (A = A, B = B, C = C, D = D, E = E) + + # Coerce A, C, D to multiclass and B to continuous and E to ordinal + X = coerce(X, + :A => Multiclass, + :B => Continuous, + :C => Multiclass, + :D => Multiclass, + :E => OrderedFactor, + ) + y = coerce(y, Multiclass) + + encoder = TargetEncoder(ordered_factor = false, lambda = 1.0, m = 0) + mach = fit!(machine(encoder, X, y)) + Xnew = MMI.transform(mach, X) + + scs = schema(Xnew).scitypes + ts = schema(Xnew).types + # Check scitypes for previously continuos or categorical features + @test all(scs[1:end-1] .== Continuous) + @test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1]) + @test scs[end] === schema(X).scitypes[end] + @test ts[end] == schema(X).types[end] +end + diff --git a/test/generic.jl b/test/generic.jl index ffaa4ae..6260d94 100644 --- a/test/generic.jl +++ b/test/generic.jl @@ -23,29 +23,64 @@ 
push!(regression_forms, create_dummy_dataset(:regression, as_dataframe = false)) push!(regression_forms, create_dummy_dataset(:regression, as_dataframe = true)) # Add datasets to the dataset forms vector -push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=false, return_y=false)) -push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=true, return_y=false)) +push!( + dataset_forms, + create_dummy_dataset(:regression, as_dataframe = false, return_y = false), +) +push!( + dataset_forms, + create_dummy_dataset(:regression, as_dataframe = true, return_y = false), +) @testset "Generate New feature names Function Tests" begin - # Test 1: No initial conflicts - @testset "No Initial Conflicts" begin - existing_names = [] - names = generate_new_feat_names("feat", 3, existing_names) - @test names == [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")] + levels = ("A", "B", "C") + + # Test 1: No initial conflicts, indices mode (use_levelnames=false) + @testset "No Initial Conflicts (Indices)" begin + existing_names = Symbol[] + names = generate_new_feat_names( + "feat", + 2, + levels, + existing_names; + use_levelnames = false, + ) + @test names == [Symbol("feat_1"), Symbol("feat_2")] + end + + # Test 2: No conflicts, level-names mode (default use_levelnames=true) + @testset "No Initial Conflicts (Level Names)" begin + existing_names = Symbol[] + names = generate_new_feat_names("feat", 3, levels, existing_names) + @test names == [Symbol("feat_A"), Symbol("feat_B"), Symbol("feat_C")] + end + + # Test 3: Handle initial conflict by adding underscores (indices) + @testset "Initial Conflict Resolution (Indices)" begin + existing_names = [Symbol("feat_1"), Symbol("feat_2")] + names = generate_new_feat_names( + "feat", + 2, + levels, + existing_names; + use_levelnames = false, + ) + @test names == [Symbol("feat__1"), Symbol("feat__2")] end - # Test 2: Handle initial conflict by adding underscores - @testset "Initial Conflict Resolution" begin - existing_names = [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")] - names = generate_new_feat_names("feat", 3, existing_names) - @test names == [Symbol("feat__1"), Symbol("feat__2"), Symbol("feat__3")] + # Test 4: Handle initial conflict by adding underscores (level names) + @testset "Initial Conflict Resolution (Level Names)" begin + existing_names = [Symbol("feat_A"), Symbol("feat_B"), Symbol("feat_C")] + names = generate_new_feat_names("feat", 3, levels, existing_names) + @test names == [Symbol("feat__A"), Symbol("feat__B"), Symbol("feat__C")] end end + # Dummy encoder that maps each level to its hash (some arbitrary function) function dummy_encoder_fit( X, - features::AbstractVector{Symbol} = Symbol[]; + features = Symbol[]; ignore::Bool = true, ordered_factor::Bool = false, ) @@ -62,14 +97,15 @@ function dummy_encoder_fit( X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper, ) - cache = Dict( - :hash_given_feat_val => hash_given_feat_val, + cache = ( + hash_given_feat_val = hash_given_feat_val, + encoded = encoded_features, ) return cache end -function dummy_encoder_transform(X, cache::Dict) - hash_given_feat_val = cache[:hash_given_feat_val] +function dummy_encoder_transform(X, cache::NamedTuple) + hash_given_feat_val = cache.hash_given_feat_val return generic_transform(X, hash_given_feat_val) end @@ -81,20 +117,24 @@ end # test exclude features feat_names = Tables.schema(X).names ignore_cols = [rand(feat_names), rand(feat_names)] - hash_given_feat_val = dummy_encoder_fit(X, 
ignore_cols; ignore = true, ordered_factor = false)[:hash_given_feat_val] + hash_given_feat_val = + dummy_encoder_fit(X, ignore_cols; ignore = true, ordered_factor = false)[:hash_given_feat_val] @test intersect(keys(hash_given_feat_val), ignore_cols) == Set() # test include features feat_names = [:A, :C, :D, :F] # these are multiclass include_cols = [rand(feat_names), rand(feat_names)] - hash_given_feat_val2 = dummy_encoder_fit(X, include_cols; ignore = false, ordered_factor = false)[:hash_given_feat_val] + hash_given_feat_val2 = + dummy_encoder_fit(X, include_cols; ignore = false, ordered_factor = false)[:hash_given_feat_val] @test intersect(keys(hash_given_feat_val2), include_cols) == Set(include_cols) # test types of encoded features feat_names = Tables.schema(X).names - hash_given_feat_val = dummy_encoder_fit(X, Symbol[]; ignore = true, ordered_factor = false)[:hash_given_feat_val] + hash_given_feat_val = + dummy_encoder_fit(X, Symbol[]; ignore = true, ordered_factor = false)[:hash_given_feat_val] @test !(:E in keys(hash_given_feat_val)) - hash_given_feat_val = dummy_encoder_fit(X, Symbol[]; ignore = true, ordered_factor = true)[:hash_given_feat_val] + hash_given_feat_val = + dummy_encoder_fit(X, Symbol[]; ignore = true, ordered_factor = true)[:hash_given_feat_val] @test (:E in keys(hash_given_feat_val)) end @@ -115,15 +155,15 @@ end "g" => enc(A_col, "g"), "b" => enc(A_col, "b"), "r" => enc(A_col, "r"), - ), + ), :D => Dict( false => enc(D_col, false), true => enc(D_col, true), - ), + ), :C => Dict( "f" => enc(C_col, "f"), "m" => enc(C_col, "m"), - ), + ), ) @test result == true_output end @@ -141,7 +181,44 @@ end C = [enc(:C, X[:C][i]) for i in 1:10], D = [enc(:D, X[:D][i]) for i in 1:10], E = [1, 2, 3, 4, 5, 6, 6, 3, 2, 1], - F = [enc(:F, X[:F][i]) for i in 1:10] + F = [enc(:F, X[:F][i]) for i in 1:10], ) @test X_tr == target -end \ No newline at end of file +end + +@testset "Callable feature functionality tests" begin + X = dataset_forms[1] + feat_names = Tables.schema(X).names + + # Define a predicate: include only columns with name in uppercase list [:A, :C, :E] + predicate = name -> name in [:A, :C, :E] + + # Test 1: ignore=true should exclude predicate columns + cache1 = dummy_encoder_fit(X, predicate; ignore = true, ordered_factor = false) + @test !(:A in cache1[:encoded]) && !(:C in cache1[:encoded]) && + !(:E in cache1[:encoded]) + + # Test 2: ignore=false should include only predicate columns + cache2 = dummy_encoder_fit(X, predicate; ignore = false, ordered_factor = false) + @test Set(cache2[:encoded]) == Set([:A, :C]) + + # Test 3: predicate with ordered_factor=true picks up ordered factors (e.g., :E) + cache3 = dummy_encoder_fit(X, predicate; ignore = false, ordered_factor = true) + @test Set(cache3[:encoded]) == Set([:A, :C, :E]) +end + +@testset "Single Symbol and list of one symbol equivalence" begin + X = dataset_forms[1] + feat_names = Tables.schema(X).names + + # Test 1: Single Symbol + single_symbol = :A + cache1 = dummy_encoder_fit(X, single_symbol; ignore = true, ordered_factor = false) + @test !(:A in cache1[:encoded]) + # Test 2: List of one symbol + single_symbol_list = [:A] + cache2 = dummy_encoder_fit(X, single_symbol_list; ignore = true, ordered_factor = false) + @test !(:A in cache2[:encoded]) + # Test 3: Both should yield the same result + @test cache1[:encoded] == cache2[:encoded] +end diff --git a/test/runtests.jl b/test/runtests.jl index b867435..d8b0f5a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,7 +15,7 @@ using StatsModels # Other 
transformers
 using Tables, CategoricalArrays
-using ScientificTypes: scitype
+using ScientificTypes: scitype, schema
 using Statistics
 using StableRNGs
 stable_rng = StableRNGs.StableRNG(123)
diff --git a/test/transformers/cardinality_reducer.jl b/test/transformers/cardinality_reducer.jl
index 6670306..dbab08b 100644
--- a/test/transformers/cardinality_reducer.jl
+++ b/test/transformers/cardinality_reducer.jl
@@ -1,40 +1,44 @@
-using MLJTransforms: union_types, cardinality_reducer_fit, cardinality_reducer_transform
+using MLJTransforms: cardinality_reducer_fit, cardinality_reducer_transform
 
-@testset "Union_types" begin
-    @test union_types(Union{Integer, String}) == (Integer, String)
-end
 
-@testset "Throws errors when needed" begin
-    @test_throws ArgumentError begin
+@testset "Cardinality Reducer Error Handling" begin
+    # Test COLLISION_NEW_VAL error - when label_for_infrequent value already exists in data
+    @test_throws MLJTransforms.COLLISION_NEW_VAL('X') begin
         X = generate_high_cardinality_table(1000; obj = false, special_cat = 'X')
         cache = cardinality_reducer_fit(
             X;
             label_for_infrequent = Dict(AbstractString => "Other", Char => 'X'),
         )
     end
-    @test_throws ArgumentError begin
+
+    # Test VALID_TYPES_NEW_VAL error - when label_for_infrequent key is not a supported type
+    @test_throws MLJTransforms.VALID_TYPES_NEW_VAL(Bool) begin
         X = generate_high_cardinality_table(1000; obj = false, special_cat = 'O')
         cache = cardinality_reducer_fit(
             X;
             label_for_infrequent = Dict(AbstractString => "Other", Bool => 'X'),
         )
     end
-    @test_throws ArgumentError begin
+
+    # Test UNSPECIFIED_COL_TYPE error - when column type isn't in label_for_infrequent
+    @test_throws MLJTransforms.UNSPECIFIED_COL_TYPE(Char, Dict(AbstractString => "X")) begin
         X = generate_high_cardinality_table(1000)
         cache = cardinality_reducer_fit(
             X;
             min_frequency = 30,
             label_for_infrequent = Dict(AbstractString => "X"),
+            # Missing Char type in label_for_infrequent, which should be present in X
         )
     end
+
 end
 
 @testset "Default for Numbers Set Correctly" begin
     X = generate_high_cardinality_table(1000)
     cache = cardinality_reducer_fit(X; min_frequency = 0.2)
-    new_cat_given_col_val = cache[:new_cat_given_col_val]
-
+    new_cat_given_col_val = cache.new_cat_given_col_val
+
     @test minimum(values(new_cat_given_col_val[:HighCardFeature1])) ==
           minimum(levels(X.HighCardFeature1)) - 1
 end
@@ -55,7 +59,7 @@ end
 
 @testset "End-to-end test" begin
     X = generate_high_cardinality_table(1000)
-
+
     for min_frequency in [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
         cache = cardinality_reducer_fit(X; min_frequency = min_frequency)
         X_tr = cardinality_reducer_transform(X, cache)
@@ -64,7 +68,8 @@ end
             new_prop_map = proportionmap(X_tr[!, col])
             for val in values(new_prop_map)
                 # all new categories (except possibly the newly added one) should satisfy min_frequency
-                @test sum(values(new_prop_map) .>= min_frequency) >= length(values(new_prop_map)) - 1
+                @test sum(values(new_prop_map) .>= min_frequency) >=
+                      length(values(new_prop_map)) - 1
             end
         end
@@ -81,7 +86,11 @@ end
     result = cardinality_reducer_fit(
         X;
         min_frequency = 0.3,
-        label_for_infrequent = Dict(AbstractString => "OtherOne", Char => 'X', Number => -99),
+        label_for_infrequent = Dict(
+            AbstractString => "OtherOne",
+            Char => 'X',
+            Number => -99,
+        ),
     )[:new_cat_given_col_val]
 
     enc_char = (col, level) -> (proportionmap(col)[level] >= 0.3 ? 
level : 'X')
@@ -92,19 +101,22 @@ end
         :LowCardFeature => Dict(
             [
                 (level, enc_char(LowCardFeature_col, level)) for
-                level in levels(LowCardFeature_col) if proportionmap(LowCardFeature_col)[level] < 0.3
+                level in levels(LowCardFeature_col) if
+                proportionmap(LowCardFeature_col)[level] < 0.3
             ],
         ),
         :HighCardFeature1 => Dict(
             [
                 (level, enc_num(HighCardFeature1_col, level)) for
-                level in levels(HighCardFeature1_col) if proportionmap(HighCardFeature1_col)[level] < 0.3
+                level in levels(HighCardFeature1_col) if
+                proportionmap(HighCardFeature1_col)[level] < 0.3
             ],
         ),
         :HighCardFeature2 => Dict(
             [
                 (level, enc_str(HighCardFeature2_col, level)) for
-                level in levels(HighCardFeature2_col) if proportionmap(HighCardFeature2_col)[level] < 0.3
+                level in levels(HighCardFeature2_col) if
+                proportionmap(HighCardFeature2_col)[level] < 0.3
             ],
         ),
     )
@@ -119,7 +131,11 @@ end
     cache = cardinality_reducer_fit(
         X;
         min_frequency = 0.3,
-        label_for_infrequent = Dict(AbstractString => "OtherOne", Char => 'X', Number => -99),
+        label_for_infrequent = Dict(
+            AbstractString => "OtherOne",
+            Char => 'X',
+            Number => -99,
+        ),
     )
 
     enc_char = (col, level) -> (proportionmap(col)[level] >= 0.3 ? level : 'X')
@@ -149,7 +165,11 @@ end
     cache = cardinality_reducer_fit(
         X;
         min_frequency = 0.1,
-        label_for_infrequent = Dict(AbstractString => "OtherOne", Char => 'X', Number => -99),
+        label_for_infrequent = Dict(
+            AbstractString => "OtherOne",
+            Char => 'X',
+            Number => -99,
+        ),
     )
     X_tr = cardinality_reducer_transform(X, cache)
     @test elscitype(X_tr[:LowCardFeature]) <: Multiclass
@@ -161,9 +181,13 @@ end
     X = Tables.columntable(generate_high_cardinality_table(10))
     levels!(Tables.getcolumn(X, :LowCardFeature), ['A', 'B', 'C', 'D', 'E', 'Z'])
 
-    cache = cardinality_reducer_fit(
+    cache = cardinality_reducer_fit(
         X;
-        label_for_infrequent = Dict(AbstractString => "OtherOne", Char => 'X', Number => -90),
+        label_for_infrequent = Dict(
+            AbstractString => "OtherOne",
+            Char => 'X',
+            Number => -90,
+        ),
     )
 
     X_tr = cardinality_reducer_transform(X, cache)
@@ -173,10 +197,15 @@ end
 @testset "MLJ Interface Cardinality Reducer" begin
     X = generate_high_cardinality_table(1000)
     # functional api
-    generic_cache = cardinality_reducer_fit(X; min_frequency=0.1, ignore = true, ordered_factor = false)
+    generic_cache = cardinality_reducer_fit(
+        X;
+        min_frequency = 0.1,
+        ignore = true,
+        ordered_factor = false,
+    )
     X_transf = cardinality_reducer_transform(X, generic_cache)
     # mlj api
-    encoder = CardinalityReducer(min_frequency=0.1, ignore = true, ordered_factor = false)
+    encoder = CardinalityReducer(min_frequency = 0.1, ignore = true, ordered_factor = false)
     mach = machine(encoder, X)
     fit!(mach)
     Xnew_transf = MMI.transform(mach, X)
@@ -186,11 +215,36 @@ end
 
     # fitted parameters is correct
     new_cat_given_col_val = fitted_params(mach).new_cat_given_col_val
-    @test new_cat_given_col_val == generic_cache[:new_cat_given_col_val]
+    @test new_cat_given_col_val == generic_cache.new_cat_given_col_val
 
     # Test report
-    @test report(mach) == (encoded_features = generic_cache[:encoded_features],)
+    @test report(mach) == (encoded_features = generic_cache.encoded_features,)
 end
 
+
+@testset "Test Cardinality Reducer Output Types" begin
+    # Define categorical features
+    A = [["a" for i in 1:100]..., "b", "b", "b", "c", "d"]
+    B = [[0 for i in 1:100]..., 1, 2, 3, 4, 4]
+
+    # Combine into a named tuple
+    X = (A = A, B = B)
+
+    # Coerce A and B to multiclass
+    X = coerce(X,
+        :A => Multiclass,
+        :B => Multiclass,
+    )
+
+    
encoder = CardinalityReducer(ordered_factor = false, min_frequency = 3) + mach = fit!(machine(encoder, X)) + Xnew = MMI.transform(mach, X) + @test schema(X).types == schema(Xnew).types + @test all(s -> (s <: Multiclass), schema(Xnew).scitypes) +end + + # Look into MLJModelInterfaceTest # Add tests to ensure categorical feature properties are as expected \ No newline at end of file
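To illustrate the cache convention these tests now assume (a minimal sketch, not part of the diff; the column data is invented), fit returns a `NamedTuple` whose fields are read with dot syntax rather than a `Dict` indexed by `Symbol`:

```julia
using MLJTransforms: cardinality_reducer_fit, cardinality_reducer_transform
using CategoricalArrays

X = (city = categorical(["NY", "NY", "NY", "LA", "LA", "SF"]),)

# An integer min_frequency is compared against raw counts, so "SF" (count 1) is infrequent
cache = cardinality_reducer_fit(X; min_frequency = 2)
cache.new_cat_given_col_val   # NamedTuple field access, formerly cache[:new_cat_given_col_val]
cache.encoded_features        # the subset of features that were encoded

X_tr = cardinality_reducer_transform(X, cache)   # "SF" becomes "Other"
```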