diff --git a/Project.toml b/Project.toml index 27579db..2a4079d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJTransforms" uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6" authors = ["Essam and contributors"] -version = "0.1.0" +version = "0.1.1" [deps] BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf" @@ -14,6 +14,7 @@ MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" +ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87" @@ -22,14 +23,15 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] BitBasis = "0.9" CategoricalArrays = "0.10" -MLJModelInterface = "1.11" Combinatorics = "1" Dates = "1" Distributions = "0.25" LinearAlgebra = "1" +MLJModelInterface = "1.11" OrderedCollections = "1" Parameters = "0.12" -ScientificTypes = "3.0" +ScientificTypes = "3.1.0" +ScientificTypesBase = "3.0.0" Statistics = "1" StatsBase = "0.34" TableOperations = "1.2" @@ -38,11 +40,11 @@ julia = "1.10" [extras] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs", "StatsModels"] diff --git a/src/MLJTransforms.jl b/src/MLJTransforms.jl index a957ca6..683a51a 100644 --- a/src/MLJTransforms.jl +++ b/src/MLJTransforms.jl @@ -1,9 +1,12 @@ module MLJTransforms using Tables -using ScientificTypes -using ScientificTypes: scitype +# Note: The `scitype` in +# MLJModelInterface clashes with the `scitype` in ScientificTypes. See also +# https://github.com/JuliaAI/MLJBase.jl/issues/1002 +import ScientificTypes: elscitype, schema, coerce, ScientificTimeType +using MLJModelInterface # exports `scitype`, which will call `ScientificTypes.scitype`, + # once MLJBase is loaded (but this is not a dependency!) using CategoricalArrays -using MLJModelInterface using TableOperations using StatsBase using LinearAlgebra @@ -15,7 +18,6 @@ using Parameters using Dates using OrderedCollections - const MMI = MLJModelInterface # Functions of generic use across transformers diff --git a/src/transformers/other_transformers/continuous_encoder.jl b/src/transformers/other_transformers/continuous_encoder.jl index 82dfdfb..7b232d5 100644 --- a/src/transformers/other_transformers/continuous_encoder.jl +++ b/src/transformers/other_transformers/continuous_encoder.jl @@ -73,7 +73,7 @@ end metadata_model(ContinuousEncoder, input_scitype = Table, output_scitype = Table(Continuous), - load_path = "MLJModels.ContinuousEncoder") + load_path = "MLJTransforms.ContinuousEncoder") """ $(MLJModelInterface.doc_header(ContinuousEncoder)) diff --git a/src/transformers/other_transformers/fill_imputer.jl b/src/transformers/other_transformers/fill_imputer.jl index adb6664..dfe7bfe 100644 --- a/src/transformers/other_transformers/fill_imputer.jl +++ b/src/transformers/other_transformers/fill_imputer.jl @@ -187,12 +187,12 @@ metadata_model(UnivariateFillImputer, AbstractVector{<:Count}, AbstractVector{<:Finite}}, human_name = "single variable fill imputer", - load_path = "MLJModels.UnivariateFillImputer") + load_path = "MLJTransforms.UnivariateFillImputer") metadata_model(FillImputer, input_scitype = Table, output_scitype = Table, - load_path = "MLJModels.FillImputer") + load_path = "MLJTransforms.FillImputer") """ $(MLJModelInterface.doc_header(UnivariateFillImputer)) diff --git a/src/transformers/other_transformers/interaction_transformer.jl b/src/transformers/other_transformers/interaction_transformer.jl index 4c2d35d..61f3acd 100644 --- a/src/transformers/other_transformers/interaction_transformer.jl +++ b/src/transformers/other_transformers/interaction_transformer.jl @@ -38,7 +38,7 @@ metadata_model(InteractionTransformer, input_scitype = Tuple{Table}, output_scitype = Table, human_name = "interaction transformer", - load_path = "MLJModels.InteractionTransformer") + load_path = "MLJTransforms.InteractionTransformer") """ $(MLJModelInterface.doc_header(InteractionTransformer)) diff --git a/src/transformers/other_transformers/metadata_shared.jl b/src/transformers/other_transformers/metadata_shared.jl index 72dc13a..91bbfd4 100644 --- a/src/transformers/other_transformers/metadata_shared.jl +++ b/src/transformers/other_transformers/metadata_shared.jl @@ -13,8 +13,8 @@ metadata_pkg.( UnivariateTimeTypeToContinuous, InteractionTransformer ), - package_name = "MLJModels", - package_uuid = "d491faf4-2d78-11e9-2867-c94bc002c0b7", - package_url = "https://github.com/JuliaAI/MLJModels.jl", + package_name = "MLJTransforms", + package_uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6", + package_url = "https://github.com/JuliaAI/MLJTransforms.jl", is_pure_julia = true, package_license = "MIT") \ No newline at end of file diff --git a/src/transformers/other_transformers/one_hot_encoder.jl b/src/transformers/other_transformers/one_hot_encoder.jl index 744bf96..bae5b75 100644 --- a/src/transformers/other_transformers/one_hot_encoder.jl +++ b/src/transformers/other_transformers/one_hot_encoder.jl @@ -163,7 +163,7 @@ metadata_model(OneHotEncoder, input_scitype = Table, output_scitype = Table, human_name = "one-hot encoder", - load_path = "MLJModels.OneHotEncoder") + load_path = "MLJTransforms.OneHotEncoder") """ $(MLJModelInterface.doc_header(OneHotEncoder)) diff --git a/src/transformers/other_transformers/standardizer.jl b/src/transformers/other_transformers/standardizer.jl index 8ce0c2b..e325096 100644 --- a/src/transformers/other_transformers/standardizer.jl +++ b/src/transformers/other_transformers/standardizer.jl @@ -229,12 +229,12 @@ metadata_model(UnivariateStandardizer, input_scitype = AbstractVector{<:Infinite}, output_scitype = AbstractVector{Continuous}, human_name = "single variable discretizer", - load_path = "MLJModels.UnivariateStandardizer") + load_path = "MLJTransforms.UnivariateStandardizer") metadata_model(Standardizer, input_scitype = Union{Table, AbstractVector{<:Continuous}}, output_scitype = Union{Table, AbstractVector{<:Continuous}}, - load_path = "MLJModels.Standardizer") + load_path = "MLJTransforms.Standardizer") """ $(MLJModelInterface.doc_header(Standardizer)) diff --git a/src/transformers/other_transformers/univariate_boxcox_transformer.jl b/src/transformers/other_transformers/univariate_boxcox_transformer.jl index b0db4ad..aba6250 100644 --- a/src/transformers/other_transformers/univariate_boxcox_transformer.jl +++ b/src/transformers/other_transformers/univariate_boxcox_transformer.jl @@ -91,7 +91,7 @@ metadata_model(UnivariateBoxCoxTransformer, input_scitype = AbstractVector{Continuous}, output_scitype = AbstractVector{Continuous}, human_name = "single variable Box-Cox transformer", - load_path = "MLJModels.UnivariateBoxCoxTransformer") + load_path = "MLJTransforms.UnivariateBoxCoxTransformer") """ $(MLJModelInterface.doc_header(UnivariateBoxCoxTransformer)) diff --git a/src/transformers/other_transformers/univariate_discretizer.jl b/src/transformers/other_transformers/univariate_discretizer.jl index 0c15b10..669d096 100644 --- a/src/transformers/other_transformers/univariate_discretizer.jl +++ b/src/transformers/other_transformers/univariate_discretizer.jl @@ -94,7 +94,7 @@ metadata_model(UnivariateDiscretizer, input_scitype = AbstractVector{<:Continuous}, output_scitype = AbstractVector{<:OrderedFactor}, human_name = "single variable discretizer", - load_path = "MLJModels.UnivariateDiscretizer") + load_path = "MLJTransforms.UnivariateDiscretizer") """ diff --git a/src/transformers/other_transformers/univariate_time_type_to_continuous.jl b/src/transformers/other_transformers/univariate_time_type_to_continuous.jl index d9b5b09..3fb71f2 100644 --- a/src/transformers/other_transformers/univariate_time_type_to_continuous.jl +++ b/src/transformers/other_transformers/univariate_time_type_to_continuous.jl @@ -129,7 +129,7 @@ metadata_model(UnivariateTimeTypeToContinuous, output_scitype = AbstractVector{Continuous}, human_name ="single variable transformer that creates "* "continuous representations of temporally typed data", - load_path = "MLJModels.UnivariateTimeTypeToContinuous") + load_path = "MLJTransforms.UnivariateTimeTypeToContinuous") """ $(MLJModelInterface.doc_header(UnivariateTimeTypeToContinuous)) diff --git a/test/encoders/contrast_encoder.jl b/test/encoders/contrast_encoder.jl index 0e252bd..c6052ff 100644 --- a/test/encoders/contrast_encoder.jl +++ b/test/encoders/contrast_encoder.jl @@ -195,8 +195,8 @@ end df = DataFrame(X) - mf = ModelFrame( - @formula(age ~ (name + height + favnum)), + mf = StatsModels.ModelFrame( + StatsModels.@formula(age ~ (name + height + favnum)), df, contrasts = Dict( :name => StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 3)), @@ -204,7 +204,7 @@ end ), ) - X_tr_sm = ModelMatrix(mf).m[:, 2:end] + X_tr_sm = StatsModels.ModelMatrix(mf).m[:, 2:end] @test X_tr_mlj == X_tr_sm end @@ -221,16 +221,16 @@ end X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1] df = DataFrame(X) - mf = ModelFrame( - @formula(age ~ (name + height + favnum)), + mf = StatsModels.ModelFrame( + StatsModels.@formula(age ~ (name + height + favnum)), df, contrasts = Dict( - :name => HypothesisCoding( + :name => StatsModels.HypothesisCoding( buildrandomhypothesis(nothing, 3); levels = levels(X.name), labels = [], ), - :favnum => HypothesisCoding( + :favnum => StatsModels.HypothesisCoding( buildrandomhypothesis(nothing, 4); levels = levels(X.favnum), labels = [], @@ -238,7 +238,7 @@ end ), ) - X_tr_sm = ModelMatrix(mf).m[:, 2:end] + X_tr_sm = StatsModels.ModelMatrix(mf).m[:, 2:end] @test X_tr_mlj == X_tr_sm end @@ -257,11 +257,11 @@ end for ind in 1:6 stats_models(k, ind) = [ StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)), - DummyCoding(; base = (k == 3) ? "Mary" : 10), - EffectsCoding(; base = (k == 3) ? "Mary" : 10), - SeqDiffCoding(), - HelmertCoding(), - HypothesisCoding( + StatsModels.DummyCoding(; base = (k == 3) ? "Mary" : 10), + StatsModels.EffectsCoding(; base = (k == 3) ? "Mary" : 10), + StatsModels.SeqDiffCoding(), + StatsModels.HelmertCoding(), + StatsModels.HypothesisCoding( buildrandomhypothesis(nothing, k); levels = (k == 3) ? levels(X.name) : levels(X.favnum), labels = [], @@ -277,8 +277,8 @@ end df = DataFrame(X) - mf = ModelFrame( - @formula(age ~ (name + height + favnum)), + mf = StatsModels.ModelFrame( + StatsModels.@formula(age ~ (name + height + favnum)), df, contrasts = Dict( :name => stats_models(3, ind), @@ -287,7 +287,7 @@ end ) X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1] - X_tr_sm = ModelMatrix(mf).m[:, 2:end] + X_tr_sm = StatsModels.ModelMatrix(mf).m[:, 2:end] @test X_tr_mlj ≈ X_tr_sm end end @@ -298,11 +298,11 @@ end for ind2 in 2:5 stats_models(k, ind) = [ StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)), - DummyCoding(; base = (k == 3) ? "Mary" : 10), - EffectsCoding(; base = (k == 3) ? "Mary" : 10), - SeqDiffCoding(), - HelmertCoding(), - HypothesisCoding( + StatsModels.DummyCoding(; base = (k == 3) ? "Mary" : 10), + StatsModels.EffectsCoding(; base = (k == 3) ? "Mary" : 10), + StatsModels.SeqDiffCoding(), + StatsModels.HelmertCoding(), + StatsModels.HypothesisCoding( buildrandomhypothesis(nothing, k); levels = (k == 3) ? levels(X.name) : levels(X.favnum), labels = [], @@ -331,8 +331,8 @@ end df = DataFrame(X) - mf = ModelFrame( - @formula(age ~ (name + height + favnum)), + mf = StatsModels.ModelFrame( + StatsModels.@formula(age ~ (name + height + favnum)), df, contrasts = Dict( :name => stats_models(3, ind1), @@ -341,7 +341,7 @@ end ) X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1] - X_tr_sm = ModelMatrix(mf).m[:, 2:end] + X_tr_sm = StatsModels.ModelMatrix(mf).m[:, 2:end] @test X_tr_mlj ≈ X_tr_sm end @@ -358,7 +358,7 @@ end encoder = ContrastEncoder(ignore = true, ordered_factor = false) mach = machine(encoder, X) fit!(mach) - Xnew_transf = MMI.transform(mach, X) + Xnew_transf = MLJBase.transform(mach, X) # same output @test X_transf == Xnew_transf @@ -392,7 +392,7 @@ end buildmatrix = matrix_func[i], ) mach = fit!(machine(encoder, X)) - Xnew = MMI.transform(mach, X) + Xnew = MLJBase.transform(mach, X) # Test Consistency with Types scs = schema(Xnew).scitypes @@ -406,4 +406,4 @@ end @test last_type <: Integer && isconcretetype(last_type) @test last_sctype <: Count end -end \ No newline at end of file +end diff --git a/test/encoders/frequency_encoder.jl b/test/encoders/frequency_encoder.jl index e08eefb..3836836 100644 --- a/test/encoders/frequency_encoder.jl +++ b/test/encoders/frequency_encoder.jl @@ -5,7 +5,7 @@ using MLJTransforms: frequency_encoder_fit, frequency_encoder_transform X = dataset_forms[1] normalize = [false, true] - A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6]) + A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6]) for norm in normalize result = frequency_encoder_fit(X; normalize = norm)[:statistic_given_feat_val] enc = @@ -72,7 +72,7 @@ end encoder = FrequencyEncoder(ignore = true, ordered_factor = false) mach = machine(encoder, X) fit!(mach) - Xnew_transf = MMI.transform(mach, X) + Xnew_transf = MLJBase.transform(mach, X) # same output @test X_transf == Xnew_transf @@ -111,7 +111,7 @@ end encoder = FrequencyEncoder(ordered_factor = false, normalize = false) mach = fit!(machine(encoder, X)) - Xnew = MMI.transform(mach, X) + Xnew = MLJBase.transform(mach, X) scs = schema(Xnew).scitypes diff --git a/test/encoders/missingness_encoding.jl b/test/encoders/missingness_encoding.jl index ed9cf43..201ebf8 100644 --- a/test/encoders/missingness_encoding.jl +++ b/test/encoders/missingness_encoding.jl @@ -170,7 +170,7 @@ end encoder = MissingnessEncoder(ignore = true, ordered_factor = false) mach = machine(encoder, X) fit!(mach) - Xnew_transf = MMI.transform(mach, X) + Xnew_transf = MLJBase.transform(mach, X) # same output @test isequal(X_transf, Xnew_transf) @@ -197,7 +197,7 @@ end encoder = MissingnessEncoder() mach = fit!(machine(encoder, Xm)) - Xnew = MMI.transform(mach, Xm) + Xnew = MLJBase.transform(mach, Xm) schema(Xm) schema(Xnew) diff --git a/test/encoders/ordinal_encoding.jl b/test/encoders/ordinal_encoding.jl index 4af6541..314aa4b 100644 --- a/test/encoders/ordinal_encoding.jl +++ b/test/encoders/ordinal_encoding.jl @@ -15,7 +15,7 @@ push!( @test ordinal_encoder_fit(dataset_forms[1]) == ordinal_encoder_fit(dataset_forms[2]) X = dataset_forms[1] result = ordinal_encoder_fit(X)[:index_given_feat_level] - A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6]) + A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6]) true_output = Dict{Symbol, Dict{Any, AbstractFloat}}( :F => Dict( "m" => findfirst(==("m"), levels(F_col)), @@ -70,7 +70,7 @@ end encoder = OrdinalEncoder(ignore = true, ordered_factor = false) mach = machine(encoder, X) fit!(mach) - Xnew_transf = MMI.transform(mach, X) + Xnew_transf = MLJBase.transform(mach, X) # same output @test X_transf == Xnew_transf @@ -108,7 +108,7 @@ end encoder = OrdinalEncoder(ordered_factor = false) mach = fit!(machine(encoder, X)) - Xnew = MMI.transform(mach, X) + Xnew = MLJBase.transform(mach, X) scs = schema(Xnew).scitypes ts = schema(Xnew).types @@ -123,7 +123,7 @@ end ## Int32 case encoder = OrdinalEncoder(ordered_factor = false, output_type = Int32) mach = fit!(machine(encoder, X)) - Xnew = MMI.transform(mach, X) + Xnew = MLJBase.transform(mach, X) scs = schema(Xnew).scitypes ts = schema(Xnew).types # Check scitypes for previously categorical features diff --git a/test/encoders/target_encoding.jl b/test/encoders/target_encoding.jl index 83d167d..9740afd 100644 --- a/test/encoders/target_encoding.jl +++ b/test/encoders/target_encoding.jl @@ -63,7 +63,7 @@ end X, y = classification_forms[1] n = length(y) - A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6]) + A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6]) true_output = Dict{Symbol, Dict{Any, AbstractFloat}}( :F => Dict( "m" => sum(y[F_col.=="m"] .== 0) / length(y[F_col.=="m"]), @@ -119,7 +119,7 @@ end n = length(y) μ̂ = mean(y) - A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6]) + A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6]) true_output = Dict{Symbol, Dict{Any, AbstractFloat}}( :F => Dict( "m" => mean(y[F_col.=="m"]), @@ -172,7 +172,7 @@ end y_classes = classes(y) n = length(y) - A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6]) + A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6]) true_output = Dict{Symbol, Dict{Any, AbstractVector{AbstractFloat}}}( :F => Dict( "m" => @@ -320,7 +320,7 @@ end TargetEncoder(ignore = true, ordered_factor = false, lambda = 0.5, m = 1.0) mach = machine(encoder, X, y) fit!(mach) - Xnew_transf = MMI.transform(mach, X) + Xnew_transf = MLJBase.transform(mach, X) # same output @test X_transf == Xnew_transf @@ -368,7 +368,7 @@ end D = [true, false, true, false, true] E = [1, 2, 3, 4, 5] - # Define the target variable + # Define the target variable y = ["c1", "c2", "c3", "c1", "c2"] # Combine into a named tuple @@ -386,7 +386,7 @@ end encoder = TargetEncoder(ordered_factor = false, lambda = 1.0, m = 0) mach = fit!(machine(encoder, X, y)) - Xnew = MMI.transform(mach, X) + Xnew = MLJBase.transform(mach, X) scs = schema(Xnew).scitypes ts = schema(Xnew).types @@ -396,4 +396,3 @@ end @test scs[end] === schema(X).scitypes[end] @test ts[end] == schema(X).types[end] end - diff --git a/test/generic.jl b/test/generic.jl index 6260d94..6842ce4 100644 --- a/test/generic.jl +++ b/test/generic.jl @@ -142,7 +142,7 @@ end @testset "Test generic fit output" begin X = dataset_forms[1] - A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6]) + A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6]) result = dummy_encoder_fit(X)[:hash_given_feat_val] enc = (col, level) -> (hash(level)) true_output = Dict{Symbol, Dict{Any, Any}}( diff --git a/test/runtests.jl b/test/runtests.jl index d8b0f5a..83c593a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,21 +1,16 @@ using MLJTransforms using Test using DataFrames -using ScientificTypes using CategoricalArrays -using MLJModelInterface using MLJBase using StatsBase using LinearAlgebra -using StatsModels +import StatsModels using Random -const MMI = MLJModelInterface using LinearAlgebra -using StatsModels # Other transformers using Tables, CategoricalArrays -using ScientificTypes: scitype, schema using Statistics using StableRNGs stable_rng = StableRNGs.StableRNG(123) @@ -40,4 +35,4 @@ include("transformers/other_transformers/interaction_transformer.jl") include("transformers/other_transformers/continuous_encoder.jl") include("transformers/other_transformers/univariate_boxcox_transformer.jl") include("transformers/other_transformers/standardizer.jl") -include("transformers/other_transformers/univariate_discretizer.jl") \ No newline at end of file +include("transformers/other_transformers/univariate_discretizer.jl") diff --git a/test/transformers/cardinality_reducer.jl b/test/transformers/cardinality_reducer.jl index dbab08b..79386e5 100644 --- a/test/transformers/cardinality_reducer.jl +++ b/test/transformers/cardinality_reducer.jl @@ -208,7 +208,7 @@ end encoder = CardinalityReducer(min_frequency = 0.1, ignore = true, ordered_factor = false) mach = machine(encoder, X) fit!(mach) - Xnew_transf = MMI.transform(mach, X) + Xnew_transf = MLJBase.transform(mach, X) # same output @test X_transf == Xnew_transf @@ -240,11 +240,11 @@ end encoder = CardinalityReducer(ordered_factor = false, min_frequency = 3) mach = fit!(machine(encoder, X)) - Xnew = MMI.transform(mach, X) + Xnew = MLJBase.transform(mach, X) @test schema(X).types == schema(Xnew).types @test all(s -> (s <: Multiclass), schema(Xnew).scitypes) end # Look into MLJModelInterfaceTest -# Add tests to ensure categorical feature properties are as expected \ No newline at end of file +# Add tests to ensure categorical feature properties are as expected