diff --git a/.gitignore b/.gitignore
index 5189df2..faf2005 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,6 +30,5 @@ scratchpad/
 examples/test.jl
 catboost_info/**
 /catboost_info
-/catboost_info
-/docs/src/tutorials/adult_example/.CondaPkg
-/docs/src/tutorials/adult_example/catboost_info
+/docs/src/tutorials/**/.CondaPkg
+/docs/src/tutorials/**/catboost_info
diff --git a/Project.toml b/Project.toml
index 0c1db3e..f1d1904 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,7 +23,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
 BitBasis = "0.9"
-CategoricalArrays = "0.10"
+CategoricalArrays = "1"
 Combinatorics = "1"
 Dates = "1"
 Distributions = "0.25"
diff --git a/docs/Project.toml b/docs/Project.toml
index a9d6acf..43f6c63 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,9 +1,6 @@
 [deps]
-CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
-DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterTools = "35a29f4d-8980-5a13-9543-d66fff28ecb8"
-MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
 MLJFlux = "094fc8d1-fd35-5302-93ea-dabda2abf845"
 MLJTransforms = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6"
diff --git a/src/MLJTransforms.jl b/src/MLJTransforms.jl
index 4c2968b..dd4a051 100644
--- a/src/MLJTransforms.jl
+++ b/src/MLJTransforms.jl
@@ -20,6 +20,9 @@ using OrderedCollections
 
 const MMI = MLJModelInterface
 
+# old behaviour of `levels` (before CategoricalArrays 1.0):
+rawlevels(A) = unwrap.(levels(A))
+
 # Functions of generic use across transformers
 include("common_docs.jl")
 include("generic.jl")
diff --git a/src/encoders/contrast_encoder/contrast_encoder.jl b/src/encoders/contrast_encoder/contrast_encoder.jl
index 63db2e5..c801554 100644
--- a/src/encoders/contrast_encoder/contrast_encoder.jl
+++ b/src/encoders/contrast_encoder/contrast_encoder.jl
@@ -102,7 +102,7 @@ function contrast_encoder_fit(
     # ensure mode is one of :contrast, :dummy, :sum, :backward_diff, :forward_diff, :helmert, :polynomial, :hypothesis
     function feature_mapper(col, name)
-        feat_levels = levels(col)
+        feat_levels = rawlevels(col)
         k = length(feat_levels)
         feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
         if feat_mode == :contrast
diff --git a/src/encoders/frequency_encoding/frequency_encoding.jl b/src/encoders/frequency_encoding/frequency_encoding.jl
index 1b3916a..08926e8 100644
--- a/src/encoders/frequency_encoding/frequency_encoding.jl
+++ b/src/encoders/frequency_encoding/frequency_encoding.jl
@@ -29,7 +29,7 @@ function frequency_encoder_fit(
     # 1. Define feature mapper
     function feature_mapper(col, name)
         frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
-        feat_levels = levels(col)
+        feat_levels = rawlevels(col)
         statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
             level => get(frequency_map, level, 0) for level in feat_levels
         )
diff --git a/src/encoders/missingness_encoding/missingness_encoding.jl b/src/encoders/missingness_encoding/missingness_encoding.jl
index b01bd63..e8ad042 100644
--- a/src/encoders/missingness_encoding/missingness_encoding.jl
+++ b/src/encoders/missingness_encoding/missingness_encoding.jl
@@ -39,7 +39,7 @@ function missingness_encoder_fit(
     # 1. Define feature mapper
     function feature_mapper(col, name)
-        feat_levels = levels(col; skipmissing = true)
+        feat_levels = unwrap.(levels(col; skipmissing = true))
         col_type = nonmissingtype(eltype(feat_levels))
         # Ensure column type is valid (can't test because never occurs)
diff --git a/src/encoders/ordinal_encoding/ordinal_encoding.jl b/src/encoders/ordinal_encoding/ordinal_encoding.jl
index 9d3d765..0db24d5 100644
--- a/src/encoders/ordinal_encoding/ordinal_encoding.jl
+++ b/src/encoders/ordinal_encoding/ordinal_encoding.jl
@@ -25,7 +25,7 @@ function ordinal_encoder_fit(
 )
     # 1. Define feature mapper
     function feature_mapper(col, name)
-        feat_levels = levels(col)
+        feat_levels = rawlevels(col)
         index_given_feat_val = Dict{eltype(feat_levels), output_type}(
             value => index for (index, value) in enumerate(feat_levels)
diff --git a/src/encoders/target_encoding/target_encoding.jl b/src/encoders/target_encoding/target_encoding.jl
index e99ec6f..bad986c 100644
--- a/src/encoders/target_encoding/target_encoding.jl
+++ b/src/encoders/target_encoding/target_encoding.jl
@@ -148,12 +148,12 @@ function target_encoder_fit(
         "Your target must be Continuous/Count for regression or Multiclass/OrderedFactor for classification",
     )
 
-    # 2. Setup prior statistics 
+    # 2. Setup prior statistics
     if task == "Regression"
         y_mean = mean(y)                    # for mixing
         m == :auto && (y_var = std(y)^2)    # for empirical Bayes estimation
     else
-        y_classes = levels(y)
+        y_classes = rawlevels(y)
         is_multiclass = length(y_classes) > 2
         if !is_multiclass       # binary case
             y_prior = sum(y .== y_classes[1]) / length(y)    # for mixing
@@ -165,10 +165,10 @@
 
     # 3. Define function to compute the new value(s) for each level given a column
     function feature_mapper(col, name)
-        feat_levels = levels(col)
+        feat_levels = rawlevels(col)
         y_stat_given_feat_level_for_col = Dict{eltype(feat_levels), Any}()
 
-        for level in levels(col)
+        for level in rawlevels(col)
             # Get the targets of an example that belong to this level
             targets_for_level = y[col.==level]
@@ -230,14 +230,14 @@ end
 Transform given data with fitted target encoder cache.
 
 # Arguments
-- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) 
+- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
   `Multiclass` or `OrderedFactor`
-- `cache`: A dictionary containing a dictionary `y_stat_given_feat_level` with the necessary statistics for 
+- `cache`: A dictionary containing a dictionary `y_stat_given_feat_level` with the necessary statistics for
   every categorical feature as well as other metadata needed for transform
 
 # Returns
 - `X`: A table where the categorical features as specified during fitting are transformed by target encoding. Other features will remain
-  the same. This will attempt to preserve the type of the table but may not succeed. 
+  the same. This will attempt to preserve the type of the table but may not succeed.
""" function target_encoder_transform(X, cache) @@ -253,4 +253,3 @@ function target_encoder_transform(X, cache) use_levelnames = true, custom_levels = y_classes) end - diff --git a/src/generic.jl b/src/generic.jl index b5bed13..39b7968 100644 --- a/src/generic.jl +++ b/src/generic.jl @@ -12,12 +12,13 @@ generic_fit(X, ) ``` -Given a `feature_mapper` (see definition below), this method applies - `feature_mapper` across a specified subset of categorical columns in X and returns a dictionary - whose keys are the feature names, and each value is the corresponding - level‑to‑value mapping produced by `feature_mapper`. +Given a `feature_mapper` (see definition below), this method applies `feature_mapper` +across a specified subset of categorical columns in X and returns a dictionary whose keys +are the feature names, and each value is the corresponding level‑to‑value mapping produced +by `feature_mapper`. -In essence, it spares effort of looping over each column and applying the `feature_mapper` function manually as well as handling the feature selection logic. +In essence, it spares effort of looping over each column and applying the `feature_mapper` +function manually as well as handling the feature selection logic. # Arguments @@ -26,17 +27,22 @@ $X_doc $features_doc $ignore_doc $ordered_factor_doc -- feature_mapper: function that, for a given vector (eg, corresponding to a categorical column from the dataset `X`), - produces a mapping from each category level name in this vector to a scalar or vector according to specified transformation logic. + +- feature_mapper: function that, for a given vector (eg, corresponding to a categorical + column from the dataset `X`), produces a mapping from each category level name in this + vector to a scalar or vector according to specified transformation logic. # Note -- Any additional arguments (whether keyword or not) provided to this function are passed to the `feature_mapper` function which - is helpful when `feature_mapper` requires additional arguments to compute the mapping (eg, hyperparameters). +- Any additional arguments (whether keyword or not) provided to this function are passed + to the `feature_mapper` function which is helpful when `feature_mapper` requires + additional arguments to compute the mapping (eg, hyperparameters). # Returns -- `mapping_per_feat_level`: Maps each level for each feature in a subset of the categorical features of - X into a scalar or a vector. + +- `mapping_per_feat_level`: Maps each level for each feature in a subset of the + categorical features of X into a scalar or a vector. + $encoded_features_doc """ function generic_fit(X, @@ -50,11 +56,11 @@ function generic_fit(X, # 1. Get X column types and names feat_names = Tables.schema(X).names - #2. Modify column_names based on features + #2. 
+    #2. Modify column_names based on features
     if features isa Symbol
         features = [features]
     end
-    
+
     if features isa AbstractVector{Symbol}
         # Original behavior for vector of symbols
         feat_names =
@@ -94,8 +100,9 @@ end
 """
 **Private method.**
 
-Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n or if possible,
-feat_name_level_0, feat_name_level_1,..., feat_name_level_n
+Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n or if
+possible, feat_name_level_0, feat_name_level_1,..., feat_name_level_n
+
 """
 function generate_new_feat_names(
     feat_name,
@@ -115,7 +122,8 @@ function generate_new_feat_names(
     suffix = repeat("_", count)
     if use_levelnames
         # Always use the first num_inds level names
-        new_column_names = [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ]
+        new_column_names =
+            [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ]
     else
         # Always use numeric indices
         new_column_names = [ Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds ]
@@ -144,34 +152,42 @@ generic_transform(
 )
 ```
 
-Apply a per‐level feature mapping to selected categorical columns in `X`, returning a new table of the same type.
+Apply a per‐level feature mapping to selected categorical columns in `X`, returning a new
+table of the same type.
 
 # Arguments
 
 $X_doc
-- `mapping_per_feat_level::Dict{Symbol,Dict}`:
-  A dict whose keys are feature names (`Symbol`) and values are themselves dictionaries
-  mapping each observed level to either a scalar (if `single_feat=true`) or a fixed‐length vector
-  (if `single_feat=false`). Only columns whose names appear in `mapping_per_feat_level` are
-  transformed; others pass through unchanged.
-- `single_feat::Bool=true`:
-  If `true`, each input level is mapped to a single scalar feature; if `false`,
-  each input level is mapped to a length‑`k` vector, producing `k` output columns.
-- `ignore_unknown::Bool=false`:
-  If `false`, novel levels in `X` (not seen during fit) will raise an error;
-  if `true`, novel levels will be left unchanged (identity mapping).
-- `use_levelnames::Bool=false`:
-  When `single_feat=false`, controls naming of the expanded columns: `true`: use actual level names (e.g. `:color_red`, `:color_blue`),
-  `false`: use numeric indices (e.g. `:color_1`, `:color_2`).
-- `custom_levels::Union{Nothing,Vector}`:
-  If not `nothing`, overrides the names of levels used to generate feature names when `single_feat=false`.
-- `ensure_categorical::Bool=false`:
-  Only when `single_feat=true` and if `true`, preserves the categorical type of the column after
-  recoding (eg, feature should still be recognized as `Multiclass` after transformation)
+
+- `mapping_per_feat_level::Dict{Symbol,Dict}`: A dict whose keys are feature names
+  (`Symbol`) and values are themselves dictionaries mapping each observed level to either
+  a scalar (if `single_feat=true`) or a fixed‐length vector (if
+  `single_feat=false`). Only columns whose names appear in `mapping_per_feat_level` are
+  transformed; others pass through unchanged.
+
+- `single_feat::Bool=true`: If `true`, each input level is mapped to a single scalar
+  feature; if `false`, each input level is mapped to a length‑`k` vector, producing `k`
+  output columns.
+
+- `ignore_unknown::Bool=false`: If `false`, novel levels in `X` (not seen during fit) will
+  raise an error; if `true`, novel levels will be left unchanged (identity mapping).
+
+- `use_levelnames::Bool=false`: When `single_feat=false`, controls naming of the expanded
+  columns: `true`: use actual level names (e.g. `:color_red`, `:color_blue`), `false`:
+  use numeric indices (e.g. `:color_1`, `:color_2`).
+
+- `custom_levels::Union{Nothing,Vector}`: If not `nothing`, overrides the names of levels
+  used to generate feature names when `single_feat=false`.
+
+- `ensure_categorical::Bool=false`: Relevant only when `single_feat=true`; if `true`,
+  preserves the categorical type of the column after recoding (eg, the feature should
+  still be recognized as `Multiclass` after transformation).
 
 # Returns
-A new table of potentially similar to `X` but with categorical columns transformed according to `mapping_per_feat_level`.
+A new table, potentially of the same type as `X`, but with categorical columns transformed
+according to `mapping_per_feat_level`.
+
 """
 function generic_transform(
     X,
@@ -191,13 +207,14 @@ function generic_transform(
         if feat_name in keys(mapping_per_feat_level)
             if !ignore_unknown
                 train_levels = keys(mapping_per_feat_level[feat_name])
-                test_levels = levels(col)
+                test_levels = rawlevels(col)
                 # test levels must be a subset of train levels
                 if !issubset(test_levels, train_levels)
                     # get the levels in test that are not in train
                     lost_levels = setdiff(test_levels, train_levels)
                     error(
-                        "While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
+                        "While transforming, found novel levels for the column "*
+                        "$(feat_name): $(lost_levels) that were not seen while training.",
                     )
                 end
             end
@@ -206,10 +223,11 @@
             level2scalar = mapping_per_feat_level[feat_name]
             if ensure_categorical
                 new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
-            else 
-                new_col = !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
+            else
+                new_col =
+                    !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
             end
-            
+
             push!(new_cols, new_col)
             push!(new_feat_names, feat_name)
         else
@@ -221,7 +239,8 @@
                 feat_names_with_inds = generate_new_feat_names(
                     feat_name,
                     length(first(mapping_per_feat_level[feat_name])[2]),
-                    (custom_levels === nothing) ? keys(mapping_per_feat_level[feat_name]) : custom_levels,
+                    (custom_levels === nothing) ?
+                        keys(mapping_per_feat_level[feat_name]) : custom_levels,
                     feat_names;
                     use_levelnames = use_levelnames,
                 )
diff --git a/src/transformers/cardinality_reducer/cardinality_reducer.jl b/src/transformers/cardinality_reducer/cardinality_reducer.jl
index a73e3aa..af78baf 100644
--- a/src/transformers/cardinality_reducer/cardinality_reducer.jl
+++ b/src/transformers/cardinality_reducer/cardinality_reducer.jl
@@ -46,7 +46,7 @@ function cardinality_reducer_fit(
     # 1. Define feature mapper
     function feature_mapper(col, name)
         val_to_freq = (min_frequency isa AbstractFloat) ? proportionmap(col) : countmap(col)
-        feat_levels = levels(col)
+        feat_levels = rawlevels(col)
         col_type = eltype(feat_levels)
         # Ensure column type is valid (can't test because never occurs)
diff --git a/src/transformers/other_transformers/one_hot_encoder.jl b/src/transformers/other_transformers/one_hot_encoder.jl
index 07a3946..10ef2e7 100644
--- a/src/transformers/other_transformers/one_hot_encoder.jl
+++ b/src/transformers/other_transformers/one_hot_encoder.jl
@@ -61,7 +61,7 @@ function MMI.fit(transformer::OneHotEncoder, verbosity::Int, X)
         if T <: allowed_scitypes && ftr in specified_features
             ref_name_pairs_given_feature[ftr] = Pair{<:Unsigned,Symbol}[]
             shift = transformer.drop_last ? 1 : 0
-            levels = classes(col)
+            levels = CategoricalArrays.levels(col)
             fitted_levels_given_feature[ftr] = levels
             if verbosity > 0
                 @info "Spawning $(length(levels)-shift) sub-features "*
@@ -136,7 +136,7 @@ function MMI.transform(transformer::OneHotEncoder, fitresult, X)
         col = MMI.selectcols(X, ftr)
         if ftr in features_to_be_transformed
             Set(fitresult.fitted_levels_given_feature[ftr]) ==
-                Set(classes(col)) ||
+                Set(levels(col)) ||
             error("Found category level mismatch in feature `$(ftr)`. "*
                   "Consider using `levels!` to ensure fitted and transforming "*
                   "features have the same category levels.")
@@ -289,4 +289,4 @@ julia> schema(W)
 See also [`ContinuousEncoder`](@ref).
 
 """
-OneHotEncoder
\ No newline at end of file
+OneHotEncoder
diff --git a/test/encoders/contrast_encoder.jl b/test/encoders/contrast_encoder.jl
index c6052ff..7f3abde 100644
--- a/test/encoders/contrast_encoder.jl
+++ b/test/encoders/contrast_encoder.jl
@@ -82,9 +82,9 @@ end
     end
    # test that fit is correct for dummy Coding
     cache = contrast_encoder_fit(X, [:name]; ignore = false, mode = :dummy)
-    k = length(levels(X.name))
+    k = length(rawlevels(X.name))
     contrast_matrix = get_dummy_contrast(k)
-    for (i, level) in enumerate(levels(X.name))
+    for (i, level) in enumerate(rawlevels(X.name))
        @test cache.vector_given_value_given_feature[:name][level] == contrast_matrix[i, :]
     end
 end
@@ -110,9 +110,9 @@ end
     @test size(contrast_matrix_3) == (3, 2)
     # test that fit is correct for sum Coding
     cache = contrast_encoder_fit(X, [:name, :favnum]; ignore = false, mode = :sum)
-    k = length(levels(X.favnum))
+    k = length(rawlevels(X.favnum))
     contrast_matrix = get_sum_contrast(k)
-    for (i, level) in enumerate(levels(X.favnum))
+    for (i, level) in enumerate(rawlevels(X.favnum))
        @test cache.vector_given_value_given_feature[:favnum][level] == contrast_matrix[i, :]
     end
@@ -130,9 +130,9 @@ end
     # Test that fit is correct for backward Coding
     cache =
         contrast_encoder_fit(X, [:name, :favnum]; ignore = false, mode = :backward_diff)
-    k = length(levels(X.favnum))
+    k = length(rawlevels(X.favnum))
     contrast_matrix = get_backward_diff_contrast(k)
-    for (i, level) in enumerate(levels(X.favnum))
+    for (i, level) in enumerate(rawlevels(X.favnum))
        @test cache.vector_given_value_given_feature[:favnum][level] == contrast_matrix[i, :]
     end
@@ -148,9 +148,9 @@ end
     # Test that fit is correct for forward Coding
     cache =
         contrast_encoder_fit(X, [:name, :favnum]; ignore = false, mode = :forward_diff)
-    k = length(levels(X.favnum))
+    k = length(rawlevels(X.favnum))
     contrast_matrix = get_forward_diff_contrast(k)
-    for (i, level) in enumerate(levels(X.favnum))
+    for (i, level) in enumerate(rawlevels(X.favnum))
        @test cache.vector_given_value_given_feature[:favnum][level] == contrast_matrix[i, :]
     end
@@ -171,9 +171,9 @@ end
          0.0 0.0 3.0]
     # test that fit is correct for helmert Coding
     cache =
         contrast_encoder_fit(X, [:name, :favnum]; ignore = false, mode = :helmert)
-    k = length(levels(X.name))
+    k = length(rawlevels(X.name))
     contrast_matrix = get_helmert_contrast(k)
-    for (i, level) in enumerate(levels(X.name))
+    for (i, level) in enumerate(rawlevels(X.name))
        @test cache.vector_given_value_given_feature[:name][level] == contrast_matrix[i, :]
     end
 end
@@ -227,12 +227,12 @@ end
         contrasts = Dict(
             :name => StatsModels.HypothesisCoding(
                 buildrandomhypothesis(nothing, 3);
-                levels = levels(X.name),
+                levels = rawlevels(X.name),
                 labels = [],
             ),
             :favnum => StatsModels.HypothesisCoding(
                 buildrandomhypothesis(nothing, 4);
-                levels = levels(X.favnum),
+                levels = rawlevels(X.favnum),
                 labels = [],
             ),
         ),
@@ -263,7 +263,7 @@ end
             StatsModels.HelmertCoding(),
             StatsModels.HypothesisCoding(
                 buildrandomhypothesis(nothing, k);
-                levels = (k == 3) ? levels(X.name) : levels(X.favnum),
+                levels = (k == 3) ? rawlevels(X.name) : rawlevels(X.favnum),
                 labels = [],
             ),
         ][ind]
@@ -304,7 +304,7 @@ end
             StatsModels.HelmertCoding(),
             StatsModels.HypothesisCoding(
                 buildrandomhypothesis(nothing, k);
-                levels = (k == 3) ? levels(X.name) : levels(X.favnum),
+                levels = (k == 3) ? rawlevels(X.name) : rawlevels(X.favnum),
                 labels = [],
             ),
         ][ind]
diff --git a/test/encoders/missingness_encoding.jl b/test/encoders/missingness_encoding.jl
index 201ebf8..431e29b 100644
--- a/test/encoders/missingness_encoding.jl
+++ b/test/encoders/missingness_encoding.jl
@@ -34,7 +34,7 @@ end
     X = generate_X_with_missingness()
     cache = missingness_encoder_fit(X)
     label_for_missing_given_feature = cache.label_for_missing_given_feature
-    @test label_for_missing_given_feature[:C][missing] == minimum(levels(X.C)) - 1
+    @test label_for_missing_given_feature[:C][missing] == minimum(rawlevels(X.C)) - 1
 end
 
@@ -52,14 +52,14 @@ end
     X_tr = missingness_encoder_transform(X, cache)
 
     for col in [:A, :B, :C, :D, :E]
-        @test issubset(levels(X[col]), levels(X_tr[col]))
+        @test issubset(rawlevels(X[col]), rawlevels(X_tr[col]))
     end
 
-    @test Set(push!(levels(X[:A]), "missing-item")) == Set(levels(X_tr[:A]))
-    @test Set(push!(levels(X[:C]), -99)) == Set(levels(X_tr[:C]))
-    @test Set(push!(levels(X[:E]), 'i')) == Set(levels(X_tr[:E]))
-    @test levels(X[:B]) == levels(X_tr[:B])
-    @test levels(X[:D]) == levels(X_tr[:D])
+    @test Set(push!(rawlevels(X[:A]), "missing-item")) == Set(rawlevels(X_tr[:A]))
+    @test Set(push!(rawlevels(X[:C]), -99)) == Set(rawlevels(X_tr[:C]))
+    @test Set(push!(rawlevels(X[:E]), 'i')) == Set(rawlevels(X_tr[:E]))
+    @test rawlevels(X[:B]) == rawlevels(X_tr[:B])
+    @test rawlevels(X[:D]) == rawlevels(X_tr[:D])
 end
 
@@ -158,7 +158,7 @@ end
     )
     X_tr = missingness_encoder_transform(X, cache)
 
-    @test issubset(levels(X[:A]), levels(X_tr[:A])) # Will have "MissingOne" added
+    @test issubset(rawlevels(X[:A]), rawlevels(X_tr[:A])) # Will have "MissingOne" added
 end
 
 @testset "MLJ Interface Missingness Encoder" begin
diff --git a/test/encoders/ordinal_encoding.jl b/test/encoders/ordinal_encoding.jl
index 314aa4b..fbb7310 100644
--- a/test/encoders/ordinal_encoding.jl
+++ b/test/encoders/ordinal_encoding.jl
@@ -18,22 +18,22 @@ push!(
     A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6])
     true_output = Dict{Symbol, Dict{Any, AbstractFloat}}(
         :F => Dict(
-            "m" => findfirst(==("m"), levels(F_col)),
-            "l" => findfirst(==("l"), levels(F_col)),
-            "s" => findfirst(==("s"), levels(F_col)),
+            "m" => findfirst(==("m"), rawlevels(F_col)),
+            "l" => findfirst(==("l"), rawlevels(F_col)),
+            "s" => findfirst(==("s"), rawlevels(F_col)),
         ),
         :A => Dict(
-            "g" => findfirst(==("g"), levels(A_col)),
-            "b" => findfirst(==("b"), levels(A_col)),
-            "r" => findfirst(==("r"), levels(A_col)),
+            "g" => findfirst(==("g"), rawlevels(A_col)),
+            "b" => findfirst(==("b"), rawlevels(A_col)),
+            "r" => findfirst(==("r"), rawlevels(A_col)),
         ),
         :D => Dict(
-            false => findfirst(==(false), levels(D_col)),
-            true => findfirst(==(true), levels(D_col)),
+            false => findfirst(==(false), rawlevels(D_col)),
+            true => findfirst(==(true), rawlevels(D_col)),
         ),
         :C => Dict(
-            "f" => findfirst(==("f"), levels(C_col)),
-            "m" => findfirst(==("m"), levels(C_col)),
+            "f" => findfirst(==("f"), rawlevels(C_col)),
+            "m" => findfirst(==("m"), rawlevels(C_col)),
         ),
     )
     @test result == true_output
@@ -46,7 +46,7 @@ end
 
     X_tr = ordinal_encoder_transform(X, cache)
 
-    enc = (col, level) -> findfirst(==(level), levels(X[col]))
+    enc = (col, level) -> findfirst(==(level), rawlevels(X[col]))
 
     target = (
         A = [enc(:A, X[:A][i]) for i in 1:10],
diff --git a/test/generic.jl b/test/generic.jl
index 6842ce4..f4c7665 100644
--- a/test/generic.jl
+++ b/test/generic.jl
@@ -86,7 +86,7 @@ function dummy_encoder_fit(
 )
     # 1. Define feature mapper
     function feature_mapper(col, name)
-        feat_levels = levels(col)
+        feat_levels = rawlevels(col)
         hash_given_feat_val =
             Dict{Any, Integer}(value => hash(value) for value in feat_levels)
         return hash_given_feat_val
diff --git a/test/runtests.jl b/test/runtests.jl
index 4073427..8925eb8 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,6 +9,9 @@ import StatsModels
 using Random
 using LinearAlgebra
 
+# old behaviour of `levels` (before CategoricalArrays 1.0):
+rawlevels(A) = unwrap.(levels(A))
+
 # Other transformers
 using Tables, CategoricalArrays
 using Statistics
diff --git a/test/transformers/cardinality_reducer.jl b/test/transformers/cardinality_reducer.jl
index 79386e5..d72d16a 100644
--- a/test/transformers/cardinality_reducer.jl
+++ b/test/transformers/cardinality_reducer.jl
@@ -40,7 +40,7 @@ end
     new_cat_given_col_val = cache.new_cat_given_col_val
 
     @test minimum(values(new_cat_given_col_val[:HighCardFeature1])) ==
-          minimum(levels(X.HighCardFeature1)) - 1
+          minimum(rawlevels(X.HighCardFeature1)) - 1
 end
 
@@ -101,21 +101,21 @@ end
         :LowCardFeature => Dict(
             [
                 (level, enc_char(LowCardFeature_col, level)) for
-                level in levels(LowCardFeature_col) if
+                level in rawlevels(LowCardFeature_col) if
                 proportionmap(LowCardFeature_col)[level] < 0.3
             ],
         ),
         :HighCardFeature1 => Dict(
             [
                 (level, enc_num(HighCardFeature1_col, level)) for
-                level in levels(HighCardFeature1_col) if
+                level in rawlevels(HighCardFeature1_col) if
                 proportionmap(HighCardFeature1_col)[level] < 0.3
             ],
         ),
         :HighCardFeature2 => Dict(
             [
                 (level, enc_str(HighCardFeature2_col, level)) for
-                level in levels(HighCardFeature2_col) if
+                level in rawlevels(HighCardFeature2_col) if
                 proportionmap(HighCardFeature2_col)[level] < 0.3
             ],
         ),
@@ -191,7 +191,7 @@ end
     )
     X_tr = cardinality_reducer_transform(X, cache)
 
-    @test 'Z' in Set(levels(X_tr[:LowCardFeature]))
+    @test 'Z' in Set(rawlevels(X_tr[:LowCardFeature]))
 end
 
 @testset "MLJ Interface Cardinality Reducer" begin
@@ -236,7 +236,7 @@ end
         :B => Multiclass,
     )
 
-    levels(X.A)
+    rawlevels(X.A)
     encoder = CardinalityReducer(ordered_factor = false, min_frequency = 3)
     mach = fit!(machine(encoder, X))
diff --git a/test/transformers/other_transformers/univariate_discretizer.jl b/test/transformers/other_transformers/univariate_discretizer.jl
index 0232518..775181b 100644
--- a/test/transformers/other_transformers/univariate_discretizer.jl
+++ b/test/transformers/other_transformers/univariate_discretizer.jl
@@ -1,4 +1,3 @@
-
 @testset "U-Discr" begin
     v = randn(10000)
     t = UnivariateDiscretizer(n_classes=100);
@@ -24,5 +23,6 @@
     v2 = v[1:3]
     w2 = MLJBase.transform(t, result, v2)
     @test levels(w2) == levels(w)
+end
 
-end
\ No newline at end of file
+true
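
Note on the `rawlevels` shim introduced in src/MLJTransforms.jl and test/runtests.jl above: the sketch below is not part of the diff. It illustrates why `unwrap.(levels(A))` recovers the pre-1.0 behaviour, assuming the CategoricalArrays 1.0 change described in the shim's own comment, under which `levels` returns wrapped `CategoricalValue`s rather than plain values. The literal results shown are illustrative.

using CategoricalArrays

x = categorical(["low", "high", "low", "medium"])

# The encoders in this package key their fitted dictionaries on raw values,
# so wrapped `CategoricalValue`s returned by `levels` under CategoricalArrays
# 1.0 would no longer match those keys. `unwrap` strips the wrapper:
rawlevels(A) = unwrap.(levels(A))

rawlevels(x)                     # ["high", "low", "medium"]: plain, sorted values
rawlevels(x) isa Vector{String}  # true under either CategoricalArrays version

Defining the shim once in the package (and once in the test suite) keeps every rewritten `levels` call site semantically identical to its pre-1.0 meaning. Note that missingness_encoding.jl inlines `unwrap.(levels(col; skipmissing = true))` instead, since the shim does not forward keyword arguments.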