diff --git a/Project.toml b/Project.toml
index 27579db..c2a7f89 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,7 @@ MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
 ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
+ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
@@ -22,14 +23,15 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 [compat]
 BitBasis = "0.9"
 CategoricalArrays = "0.10"
-MLJModelInterface = "1.11"
 Combinatorics = "1"
 Dates = "1"
 Distributions = "0.25"
 LinearAlgebra = "1"
+MLJModelInterface = "1.11"
 OrderedCollections = "1"
 Parameters = "0.12"
-ScientificTypes = "3.0"
+ScientificTypes = "3.1.0"
+ScientificTypesBase = "3.0.0"
 Statistics = "1"
 StatsBase = "0.34"
 TableOperations = "1.2"
@@ -38,11 +40,11 @@ julia = "1.10"
 
 [extras]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
 test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs", "StatsModels"]
diff --git a/src/MLJTransforms.jl b/src/MLJTransforms.jl
index a957ca6..683a51a 100644
--- a/src/MLJTransforms.jl
+++ b/src/MLJTransforms.jl
@@ -1,9 +1,12 @@
 module MLJTransforms
 using Tables
-using ScientificTypes
-using ScientificTypes: scitype
+# Note: The `scitype` in
+# MLJModelInterface clashes with the `scitype` in ScientificTypes. See also
+# https://github.com/JuliaAI/MLJBase.jl/issues/1002
+import ScientificTypes: elscitype, schema, coerce, ScientificTimeType
+using MLJModelInterface # exports `scitype`, which will call `ScientificTypes.scitype`,
+                        # once MLJBase is loaded (but this is not a dependency!)
 using CategoricalArrays
-using MLJModelInterface
 using TableOperations
 using StatsBase
 using LinearAlgebra
@@ -15,7 +18,6 @@ using Parameters
 using Dates
 using OrderedCollections
 
-
 const MMI = MLJModelInterface
 
 # Functions of generic use across transformers
diff --git a/test/encoders/contrast_encoder.jl b/test/encoders/contrast_encoder.jl
index 0e252bd..c6052ff 100644
--- a/test/encoders/contrast_encoder.jl
+++ b/test/encoders/contrast_encoder.jl
@@ -195,8 +195,8 @@ end
 
     df = DataFrame(X)
 
-    mf = ModelFrame(
-        @formula(age ~ (name + height + favnum)),
+    mf = StatsModels.ModelFrame(
+        StatsModels.@formula(age ~ (name + height + favnum)),
         df,
         contrasts = Dict(
             :name => StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 3)),
@@ -204,7 +204,7 @@ end
         ),
     )
 
-    X_tr_sm = ModelMatrix(mf).m[:, 2:end]
+    X_tr_sm = StatsModels.ModelMatrix(mf).m[:, 2:end]
 
     @test X_tr_mlj == X_tr_sm
 end
@@ -221,16 +221,16 @@ end
     X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1]
 
     df = DataFrame(X)
 
-    mf = ModelFrame(
-        @formula(age ~ (name + height + favnum)),
+    mf = StatsModels.ModelFrame(
+        StatsModels.@formula(age ~ (name + height + favnum)),
         df,
         contrasts = Dict(
-            :name => HypothesisCoding(
+            :name => StatsModels.HypothesisCoding(
                 buildrandomhypothesis(nothing, 3);
                 levels = levels(X.name),
                 labels = [],
             ),
-            :favnum => HypothesisCoding(
+            :favnum => StatsModels.HypothesisCoding(
                 buildrandomhypothesis(nothing, 4);
                 levels = levels(X.favnum),
                 labels = [],
@@ -238,7 +238,7 @@ end
         ),
     )
 
-    X_tr_sm = ModelMatrix(mf).m[:, 2:end]
+    X_tr_sm = StatsModels.ModelMatrix(mf).m[:, 2:end]
 
     @test X_tr_mlj == X_tr_sm
 end
@@ -257,11 +257,11 @@ end
     for ind in 1:6
         stats_models(k, ind) = [
             StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)),
-            DummyCoding(; base = (k == 3) ? "Mary" : 10),
-            EffectsCoding(; base = (k == 3) ? "Mary" : 10),
-            SeqDiffCoding(),
-            HelmertCoding(),
-            HypothesisCoding(
+            StatsModels.DummyCoding(; base = (k == 3) ? "Mary" : 10),
+            StatsModels.EffectsCoding(; base = (k == 3) ? "Mary" : 10),
+            StatsModels.SeqDiffCoding(),
+            StatsModels.HelmertCoding(),
+            StatsModels.HypothesisCoding(
                 buildrandomhypothesis(nothing, k);
                 levels = (k == 3) ? levels(X.name) : levels(X.favnum),
                 labels = [],
@@ -277,8 +277,8 @@ end
 
         df = DataFrame(X)
 
-        mf = ModelFrame(
-            @formula(age ~ (name + height + favnum)),
+        mf = StatsModels.ModelFrame(
+            StatsModels.@formula(age ~ (name + height + favnum)),
             df,
             contrasts = Dict(
                 :name => stats_models(3, ind),
@@ -287,7 +287,7 @@ end
         )
 
         X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1]
-        X_tr_sm = ModelMatrix(mf).m[:, 2:end]
+        X_tr_sm = StatsModels.ModelMatrix(mf).m[:, 2:end]
         @test X_tr_mlj ≈ X_tr_sm
     end
 end
@@ -298,11 +298,11 @@ end
         for ind2 in 2:5
             stats_models(k, ind) = [
                 StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)),
-                DummyCoding(; base = (k == 3) ? "Mary" : 10),
-                EffectsCoding(; base = (k == 3) ? "Mary" : 10),
-                SeqDiffCoding(),
-                HelmertCoding(),
-                HypothesisCoding(
+                StatsModels.DummyCoding(; base = (k == 3) ? "Mary" : 10),
+                StatsModels.EffectsCoding(; base = (k == 3) ? "Mary" : 10),
+                StatsModels.SeqDiffCoding(),
+                StatsModels.HelmertCoding(),
+                StatsModels.HypothesisCoding(
                     buildrandomhypothesis(nothing, k);
                     levels = (k == 3) ? levels(X.name) : levels(X.favnum),
                     labels = [],
@@ -331,8 +331,8 @@ end
 
             df = DataFrame(X)
 
-            mf = ModelFrame(
-                @formula(age ~ (name + height + favnum)),
+            mf = StatsModels.ModelFrame(
+                StatsModels.@formula(age ~ (name + height + favnum)),
                 df,
                 contrasts = Dict(
                     :name => stats_models(3, ind1),
@@ -341,7 +341,7 @@ end
             )
 
             X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1]
-            X_tr_sm = ModelMatrix(mf).m[:, 2:end]
+            X_tr_sm = StatsModels.ModelMatrix(mf).m[:, 2:end]
 
             @test X_tr_mlj ≈ X_tr_sm
         end
@@ -358,7 +358,7 @@ end
     encoder = ContrastEncoder(ignore = true, ordered_factor = false)
     mach = machine(encoder, X)
     fit!(mach)
-    Xnew_transf = MMI.transform(mach, X)
+    Xnew_transf = MLJBase.transform(mach, X)
 
     # same output
    @test X_transf == Xnew_transf
@@ -392,7 +392,7 @@ end
             buildmatrix = matrix_func[i],
         )
         mach = fit!(machine(encoder, X))
-        Xnew = MMI.transform(mach, X)
+        Xnew = MLJBase.transform(mach, X)
 
         # Test Consistency with Types
         scs = schema(Xnew).scitypes
@@ -406,4 +406,4 @@ end
         @test last_type <: Integer && isconcretetype(last_type)
         @test last_sctype <: Count
     end
-end
\ No newline at end of file
+end
diff --git a/test/encoders/frequency_encoder.jl b/test/encoders/frequency_encoder.jl
index e08eefb..3836836 100644
--- a/test/encoders/frequency_encoder.jl
+++ b/test/encoders/frequency_encoder.jl
@@ -5,7 +5,7 @@ using MLJTransforms: frequency_encoder_fit, frequency_encoder_transform
 
     X = dataset_forms[1]
     normalize = [false, true]
-    A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6])
+    A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6])
     for norm in normalize
         result = frequency_encoder_fit(X; normalize = norm)[:statistic_given_feat_val]
         enc =
@@ -72,7 +72,7 @@ end
     encoder = FrequencyEncoder(ignore = true, ordered_factor = false)
     mach = machine(encoder, X)
     fit!(mach)
-    Xnew_transf = MMI.transform(mach, X)
+    Xnew_transf = MLJBase.transform(mach, X)
 
     # same output
     @test X_transf == Xnew_transf
@@ -111,7 +111,7 @@ end
 
     encoder = FrequencyEncoder(ordered_factor = false, normalize = false)
     mach = fit!(machine(encoder, X))
-    Xnew = MMI.transform(mach, X)
+    Xnew = MLJBase.transform(mach, X)
 
     scs = schema(Xnew).scitypes
 
diff --git a/test/encoders/missingness_encoding.jl b/test/encoders/missingness_encoding.jl
index ed9cf43..201ebf8 100644
--- a/test/encoders/missingness_encoding.jl
+++ b/test/encoders/missingness_encoding.jl
@@ -170,7 +170,7 @@ end
     encoder = MissingnessEncoder(ignore = true, ordered_factor = false)
     mach = machine(encoder, X)
     fit!(mach)
-    Xnew_transf = MMI.transform(mach, X)
+    Xnew_transf = MLJBase.transform(mach, X)
 
     # same output
     @test isequal(X_transf, Xnew_transf)
@@ -197,7 +197,7 @@ end
 
     encoder = MissingnessEncoder()
     mach = fit!(machine(encoder, Xm))
-    Xnew = MMI.transform(mach, Xm)
+    Xnew = MLJBase.transform(mach, Xm)
 
     schema(Xm)
     schema(Xnew)
diff --git a/test/encoders/ordinal_encoding.jl b/test/encoders/ordinal_encoding.jl
index 4af6541..314aa4b 100644
--- a/test/encoders/ordinal_encoding.jl
+++ b/test/encoders/ordinal_encoding.jl
@@ -15,7 +15,7 @@ push!(
     @test ordinal_encoder_fit(dataset_forms[1]) == ordinal_encoder_fit(dataset_forms[2])
     X = dataset_forms[1]
     result = ordinal_encoder_fit(X)[:index_given_feat_level]
-    A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6])
+    A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6])
     true_output = Dict{Symbol, Dict{Any, AbstractFloat}}(
         :F => Dict(
             "m" => findfirst(==("m"), levels(F_col)),
@@ -70,7 +70,7 @@ end
     encoder = OrdinalEncoder(ignore = true, ordered_factor = false)
     mach = machine(encoder, X)
     fit!(mach)
-    Xnew_transf = MMI.transform(mach, X)
+    Xnew_transf = MLJBase.transform(mach, X)
 
     # same output
     @test X_transf == Xnew_transf
@@ -108,7 +108,7 @@ end
     encoder = OrdinalEncoder(ordered_factor = false)
     mach = fit!(machine(encoder, X))
-    Xnew = MMI.transform(mach, X)
+    Xnew = MLJBase.transform(mach, X)
 
     scs = schema(Xnew).scitypes
     ts = schema(Xnew).types
 
@@ -123,7 +123,7 @@ end
     ## Int32 case
     encoder = OrdinalEncoder(ordered_factor = false, output_type = Int32)
     mach = fit!(machine(encoder, X))
-    Xnew = MMI.transform(mach, X)
+    Xnew = MLJBase.transform(mach, X)
     scs = schema(Xnew).scitypes
     ts = schema(Xnew).types
     # Check scitypes for previously categorical features
diff --git a/test/encoders/target_encoding.jl b/test/encoders/target_encoding.jl
index 83d167d..9740afd 100644
--- a/test/encoders/target_encoding.jl
+++ b/test/encoders/target_encoding.jl
@@ -63,7 +63,7 @@ end
     X, y = classification_forms[1]
     n = length(y)
 
-    A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6])
+    A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6])
     true_output = Dict{Symbol, Dict{Any, AbstractFloat}}(
         :F => Dict(
             "m" => sum(y[F_col.=="m"] .== 0) / length(y[F_col.=="m"]),
@@ -119,7 +119,7 @@ end
     n = length(y)
     μ̂ = mean(y)
 
-    A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6])
+    A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6])
     true_output = Dict{Symbol, Dict{Any, AbstractFloat}}(
         :F => Dict(
             "m" => mean(y[F_col.=="m"]),
@@ -172,7 +172,7 @@ end
     y_classes = classes(y)
     n = length(y)
 
-    A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6])
+    A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6])
     true_output = Dict{Symbol, Dict{Any, AbstractVector{AbstractFloat}}}(
         :F => Dict(
             "m" =>
@@ -320,7 +320,7 @@ end
         TargetEncoder(ignore = true, ordered_factor = false, lambda = 0.5, m = 1.0)
     mach = machine(encoder, X, y)
     fit!(mach)
-    Xnew_transf = MMI.transform(mach, X)
+    Xnew_transf = MLJBase.transform(mach, X)
 
     # same output
     @test X_transf == Xnew_transf
@@ -368,7 +368,7 @@ end
     D = [true, false, true, false, true]
    E = [1, 2, 3, 4, 5]
 
-    # Define the target variable 
+    # Define the target variable
     y = ["c1", "c2", "c3", "c1", "c2"]
 
     # Combine into a named tuple
@@ -386,7 +386,7 @@ end
 
     encoder = TargetEncoder(ordered_factor = false, lambda = 1.0, m = 0)
     mach = fit!(machine(encoder, X, y))
-    Xnew = MMI.transform(mach, X)
+    Xnew = MLJBase.transform(mach, X)
 
     scs = schema(Xnew).scitypes
     ts = schema(Xnew).types
@@ -396,4 +396,3 @@ end
     @test scs[end] === schema(X).scitypes[end]
     @test ts[end] == schema(X).types[end]
 end
-
diff --git a/test/generic.jl b/test/generic.jl
index 6260d94..6842ce4 100644
--- a/test/generic.jl
+++ b/test/generic.jl
@@ -142,7 +142,7 @@ end
 
 @testset "Test generic fit output" begin
     X = dataset_forms[1]
-    A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6])
+    A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6])
     result = dummy_encoder_fit(X)[:hash_given_feat_val]
     enc = (col, level) -> (hash(level))
     true_output = Dict{Symbol, Dict{Any, Any}}(
diff --git a/test/runtests.jl b/test/runtests.jl
index d8b0f5a..83c593a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,21 +1,16 @@
 using MLJTransforms
 using Test
 using DataFrames
-using ScientificTypes
 using CategoricalArrays
-using MLJModelInterface
 using MLJBase
 using StatsBase
 using LinearAlgebra
-using StatsModels
+import StatsModels
 using Random
-const MMI = MLJModelInterface
 using LinearAlgebra
-using StatsModels
 
 # Other transformers
 using Tables, CategoricalArrays
-using ScientificTypes: scitype, schema
 using Statistics
 using StableRNGs
 stable_rng = StableRNGs.StableRNG(123)
@@ -40,4 +35,4 @@ include("transformers/other_transformers/interaction_transformer.jl")
 include("transformers/other_transformers/continuous_encoder.jl")
 include("transformers/other_transformers/univariate_boxcox_transformer.jl")
 include("transformers/other_transformers/standardizer.jl")
-include("transformers/other_transformers/univariate_discretizer.jl")
\ No newline at end of file
+include("transformers/other_transformers/univariate_discretizer.jl")
diff --git a/test/transformers/cardinality_reducer.jl b/test/transformers/cardinality_reducer.jl
index dbab08b..79386e5 100644
--- a/test/transformers/cardinality_reducer.jl
+++ b/test/transformers/cardinality_reducer.jl
@@ -208,7 +208,7 @@ end
    encoder = CardinalityReducer(min_frequency = 0.1, ignore = true, ordered_factor = false)
     mach = machine(encoder, X)
     fit!(mach)
-    Xnew_transf = MMI.transform(mach, X)
+    Xnew_transf = MLJBase.transform(mach, X)
 
     # same output
     @test X_transf == Xnew_transf
@@ -240,11 +240,11 @@ end
 
     encoder = CardinalityReducer(ordered_factor = false, min_frequency = 3)
     mach = fit!(machine(encoder, X))
-    Xnew = MMI.transform(mach, X)
+    Xnew = MLJBase.transform(mach, X)
 
     @test schema(X).types == schema(Xnew).types
     @test all(s -> (s <: Multiclass), schema(Xnew).scitypes)
 end
 
 # Look into MLJModelInterfaceTest
-# Add tests to ensure categorical feature properties are as expected
\ No newline at end of file
+# Add tests to ensure categorical feature properties are as expected
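
For context, a minimal Julia sketch of the export clash that the src/MLJTransforms.jl hunk works around (illustrative only, not part of the patch; it assumes both packages are installed):

    using ScientificTypes    # exports `scitype`
    using MLJModelInterface  # also exports `scitype`

    # The unqualified name is now ambiguous; using it triggers
    # "both MLJModelInterface and ScientificTypes export `scitype`;
    # uses of it must be qualified", so this line would error:
    # scitype([1.0, 2.0])

    # Qualified calls always work:
    ScientificTypes.scitype([1.0, 2.0])  # AbstractVector{Continuous}

    # The hunk above avoids the ambiguity by importing only the
    # ScientificTypes names it needs and letting MLJModelInterface own the
    # unqualified `scitype`, which (per the comment in the hunk) delegates
    # to `ScientificTypes.scitype` once MLJBase is loaded:
    #   import ScientificTypes: elscitype, schema, coerce, ScientificTimeType
    #   using MLJModelInterface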