diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 55d4ce2..75979e0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,7 +43,7 @@ jobs: - uses: julia-actions/julia-runtest@v1 env: # This environment variable enables the integration tests: - MLJ_TEST_REGISTRY: '1' + MLJ_TEST_REGISTRY: "false" - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v4 with: diff --git a/Project.toml b/Project.toml index 5c53aaa..315d615 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJModels" uuid = "d491faf4-2d78-11e9-2867-c94bc002c0b7" authors = ["Anthony D. Blaom "] -version = "0.18.1" +version = "0.18.2" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" @@ -28,8 +28,8 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -CategoricalArrays = "0.9, 0.10" -CategoricalDistributions = "0.1" +CategoricalArrays = "1" +CategoricalDistributions = "0.2" Combinatorics = "1.0" Dates = "1" Distances = "0.9,0.10" diff --git a/src/MLJModels.jl b/src/MLJModels.jl index 3f85e81..c727d6f 100755 --- a/src/MLJModels.jl +++ b/src/MLJModels.jl @@ -22,8 +22,7 @@ using Combinatorics import Distributions import REPL # stdlib, needed for `Term` import PrettyPrinting -import CategoricalDistributions: UnivariateFinite, UnivariateFiniteArray, - classes +import CategoricalDistributions: UnivariateFinite, UnivariateFiniteArray import StatisticalTraits # for `info` # from loading.jl: diff --git a/src/builtins/Constant.jl b/src/builtins/Constant.jl index 886d164..e546307 100644 --- a/src/builtins/Constant.jl +++ b/src/builtins/Constant.jl @@ -55,7 +55,7 @@ function MLJModelInterface.fit(::ConstantClassifier, y, w=nothing) d = Distributions.fit(UnivariateFinite, y, w) - C = classes(d) + C = levels(d) fitresult = (C, Distributions.pdf([d, ], C)) cache = nothing report = NamedTuple() @@ -66,10 +66,10 @@ MLJModelInterface.fitted_params(::ConstantClassifier, fitresult) = (target_distribution=fitresult,) function MLJModelInterface.predict(::ConstantClassifier, fitresult, Xnew) - _classes, probs1 = fitresult + _levels, probs1 = fitresult N = nrows(Xnew) - probs = reshape(vcat(fill(probs1, N)...), N, length(_classes)) - return UnivariateFinite(_classes, probs) + probs = reshape(vcat(fill(probs1, N)...), N, length(_levels)) + return UnivariateFinite(_levels, probs) end @@ -216,10 +216,11 @@ ConstantRegressor This "dummy" probabilistic predictor always returns the same distribution, irrespective of the provided input pattern. The distribution `d` returned is the `UnivariateFinite` -distribution based on frequency of classes observed in the training target data. So, -`pdf(d, level)` is the number of times the training target takes on the value `level`. -Use `predict_mode` instead of `predict` to obtain the training target mode instead. For -more on the `UnivariateFinite` type, see the CategoricalDistributions.jl package. +distribution based on frequency of levels (classes) observed in the training target +data. So, `pdf(d, level)` is the number of times the training target takes on the value +`level`. Use `predict_mode` instead of `predict` to obtain the training target mode +instead. For more on the `UnivariateFinite` type, see the CategoricalDistributions.jl +package. Almost any reasonable model is expected to outperform `ConstantClassifier`, which is used almost exclusively for testing and establishing performance baselines. diff --git a/src/builtins/ThresholdPredictors.jl b/src/builtins/ThresholdPredictors.jl index 18e6da9..9b06f8a 100644 --- a/src/builtins/ThresholdPredictors.jl +++ b/src/builtins/ThresholdPredictors.jl @@ -56,16 +56,16 @@ const ThresholdSupported = Union{keys(_type_given_atom)...} const ERR_MODEL_UNSPECIFIED = ArgumentError( "Expecting atomic model as argument. None specified. ") -warn_classes(first_class, second_class) = +warn_levels(first_class, second_class) = "Taking positive class as `$(second_class)` and negative class as"* "`$(first_class)`."* "Coerce target to `OrderedFactor{2}` to suppress this warning, "* "ensuring that positive class > negative class. " -const ERR_CLASSES_DETECTOR = ArgumentError( +const ERR_LEVELS_DETECTOR = ArgumentError( "Targets for detector models must be ordered. Consider coercing to "* "`OrderedFactor`, ensuring that outlier class > inlier class. ") const ERR_TARGET_NOT_BINARY = ArgumentError( - "Target `y` must have two classes in its pool, even if only one "* + "Target `y` must have two levels in its pool, even if only one "* "class is manifest. ") const err_unsupported_model_type(T) = ArgumentError( "`BinaryThresholdPredictor` does not support atomic models with supertype `$T`. "* @@ -208,9 +208,9 @@ function MMI.fit(model::ThresholdUnion, verbosity::Int, args...) length(L) == 2 || throw(ERR_TARGET_NOT_BINARY) first_class, second_class = L if model.model isa Probabilistic - @warn warn_classes(first_class, second_class) + @warn warn_levels(first_class, second_class) else - throw(ERR_CLASSES_DETECTOR) + throw(ERR_LEVELS_DETECTOR) end end model_fitresult, model_cache, model_report = MMI.fit( @@ -259,7 +259,7 @@ function _predict_threshold(yhat::UnivariateFinite, threshold) dict = yhat.prob_given_ref length(threshold) == length(dict) || throw( ArgumentError( - "`length(threshold)` has to equal number of classes in specified "* + "`length(threshold)` has to equal number of levels in specified "* "`UnivariateFinite` distribution." ) ) @@ -277,14 +277,14 @@ function _predict_threshold(yhat::UnivariateFiniteArray{S,V,R,P,N}, dict = yhat.prob_given_ref length(threshold) == length(dict) || throw( ArgumentError( - "`length(threshold)` has to equal number of classes in specified "* + "`length(threshold)` has to equal number of levels in specified "* "`UnivariateFiniteArray`." ) ) d = yhat.decoder(1) levs = levels(d) ord = isordered(d) - # Array to house the predicted classes + # Array to house the predicted levels ret = CategoricalArray{V, N, R}(undef, size(yhat), levels=levs, ordered=ord) #ret = Array{CategoricalValue{V, R}, N}(undef, size(yhat)) # `temp` vector allocted once to be used for calculations in each loop diff --git a/test/builtins/Constant.jl b/test/builtins/Constant.jl index d32f306..2636abd 100644 --- a/test/builtins/Constant.jl +++ b/test/builtins/Constant.jl @@ -35,14 +35,14 @@ end d = MLJBase.UnivariateFinite([y[1], y[2], y[4]], [0.5, 0.25, 0.25]) yhat = MLJBase.predict_mode(model, fitresult, X) - @test MLJBase.classes(yhat[1]) == MLJBase.classes(y[1]) + @test levels(yhat[1]) == levels(y[1]) @test yhat[5] == y[1] @test length(yhat) == 10 yhat = MLJBase.predict(model, fitresult, X) yhat1 = yhat[1] - for c in MLJBase.classes(d) + for c in levels(d) Distributions.pdf(yhat1, c) ≈ Distributions.pdf(d, c) end diff --git a/test/builtins/ThresholdPredictors.jl b/test/builtins/ThresholdPredictors.jl index 30b0cc7..f8525d6 100644 --- a/test/builtins/ThresholdPredictors.jl +++ b/test/builtins/ThresholdPredictors.jl @@ -27,9 +27,9 @@ y2_ = categorical(yraw[2:end], ordered=true) ) # Check warning when `y` is not ordered: - @test_logs((:warn, MLJModels.warn_classes(levels(y_)...)), + @test_logs((:warn, MLJModels.warn_levels(levels(y_)...)), MMI.fit(model, 1, MMI.reformat(model, X_, y1_)...)) - # Check predictions containing two classes + # Check predictions containing two levels @test_throws ArgumentError BinaryThresholdPredictor(ConstantRegressor()) @test_logs((:warn, r"`threshold` should be"), BinaryThresholdPredictor(atom, threshold=-1)) @@ -88,13 +88,13 @@ end v1 = categorical(['a', 'b', 'a']) v2 = categorical(['a', 'b', 'a', 'c']) # Test with UnivariateFinite object - d1 = UnivariateFinite(MMI.classes(v1), [0.4, 0.6]) + d1 = UnivariateFinite(levels(v1), [0.4, 0.6]) @test_throws ArgumentError MLJModels._predict_threshold(d1, 0.7) @test MLJModels._predict_threshold(d1, (0.7, 0.3)) == v1[2] @test MLJModels._predict_threshold(d1, [0.5, 0.5]) == v1[2] @test MLJModels._predict_threshold(d1, (0.4, 0.6)) == v1[1] @test MLJModels._predict_threshold(d1, [0.2, 0.8]) == v1[1] - d2 = UnivariateFinite(MMI.classes(v2), [0.4, 0.3, 0.3]) + d2 = UnivariateFinite(levels(v2), [0.4, 0.3, 0.3]) @test_throws ArgumentError MLJModels._predict_threshold(d2, (0.7, 0.3)) @test MLJModels._predict_threshold(d2, (0.2, 0.5, 0.3)) == v2[1] @test MLJModels._predict_threshold(d2, [0.3, 0.2, 0.5]) == v2[2] @@ -117,14 +117,14 @@ end # Test with UnivariateFiniteArray oject probs1 = [0.2 0.8; 0.7 0.3; 0.1 0.9] - unf_arr1 = UnivariateFinite(MMI.classes(v1), probs1) + unf_arr1 = UnivariateFinite(levels(v1), probs1) @test_throws ArgumentError MLJModels._predict_threshold(unf_arr1, 0.7) @test MLJModels._predict_threshold(unf_arr1, (0.7, 0.3)) == [v1[2], v1[1], v1[2]] @test MLJModels._predict_threshold(unf_arr1, [0.5, 0.5]) == [v1[2], v1[1], v1[2]] @test MLJModels._predict_threshold(unf_arr1, (0.4, 0.6)) == [v1[2], v1[1], v1[2]] @test MLJModels._predict_threshold(unf_arr1, [0.2, 0.8]) == [v1[1], v1[1], v1[2]] probs2 = [0.2 0.3 0.5;0.1 0.6 0.3; 0.4 0.0 0.6] - unf_arr2 = UnivariateFinite(MMI.classes(v2), probs2) + unf_arr2 = UnivariateFinite(levels(v2), probs2) @test_throws ArgumentError MLJModels._predict_threshold(unf_arr2, (0.7, 0.3)) @test MLJModels._predict_threshold(unf_arr2, (0.2, 0.5, 0.3)) == [v2[4], v2[2], v2[1]] @test MLJModels._predict_threshold(unf_arr2, [0.3, 0.2, 0.5]) == [v2[2], v2[2], v2[1]] @@ -144,7 +144,7 @@ MMI.input_scitype(::Type{<:DummyDetector}) = MMI.Table @testset "BinaryThresholdPredictor - ProbabilisticUnsupervisedDetector" begin detector = BinaryThresholdPredictor(DummyDetector(), threshold=0.2) - @test_throws MLJModels.ERR_CLASSES_DETECTOR MMI.fit( + @test_throws MLJModels.ERR_LEVELS_DETECTOR MMI.fit( detector, 1, MMI.reformat(detector, X_, y1_)... ) diff --git a/test/testutils.jl b/test/testutils.jl deleted file mode 100644 index 5e1ce22..0000000 --- a/test/testutils.jl +++ /dev/null @@ -1,115 +0,0 @@ -using Random, MLJBase - -function gen_reg(; n=100, p=5, seed=143) - Random.seed!(143) - X = randn(n, p) - y = randn(n) - return MLJBase.table(X), y -end - -function gen_classif(; n=100, p=5, seed=145, classes=["A", "B"]) - Random.seed!(seed) - X = randn(n, p) - # gen [1, 2, 1, 3,..] - M = exp.(randn(n, length(classes))) - Mn = M ./ sum(M, dims=2) - z = multi_rand(Mn) - # associate labels - y = [classes[zᵢ] for zᵢ in z] - return MLJBase.table(X), MLJBase.categorical(y) -end - -function gen_dummy_classif_binary(; n=50, p=5, seed=1566) - # create clouds of points that are super separated - cloud1 = randn(n, p) .+ 5.0 - cloud2 = randn(n, p) .- 5.0 - test1 = randn(10, p) .+ 5.0 - test2 = randn(10, p) .- 5.0 - X = MLJBase.table(vcat(cloud1, cloud2)) - Xt = MLJBase.table(vcat(test1, test2)) - y1 = fill("A", n) - y2 = fill("B", n) - yt1 = fill("A", 10) - yt2 = fill("B", 10) - y = MLJBase.categorical(vcat(y1, y2)) - yt = MLJBase.categorical(vcat(yt1, yt2)) - return X, Xt, y, yt -end - -function gen_dummy_classif(; n=50, p=5, seed=1566) - # create clouds of points that are super separated - cloud1 = randn(n, p) .+ 5.0 - cloud2 = randn(n, p) - cloud3 = randn(n, p) .- 5.0 - test1 = randn(10, p) .+ 5.0 - test2 = randn(10, p) - test3 = randn(10, p) .- 5.0 - X = MLJBase.table(vcat(cloud1, cloud2, cloud3)) - Xt = MLJBase.table(vcat(test1, test2, test3)) - y1 = fill("A", n) - y2 = fill("B", n) - y3 = fill("C", n) - yt1 = fill("A", 10) - yt2 = fill("B", 10) - yt3 = fill("C", 10) - y = MLJBase.categorical(vcat(y1, y2, y3)) - yt = MLJBase.categorical(vcat(yt1, yt2, yt3)) - return X, Xt, y, yt -end - -# simple function to sample multinomial -function multi_rand(Mp) - n, c = size(Mp) - be = reshape(rand(length(Mp)), n, c) - y = zeros(Int, n) - @inbounds for i in eachindex(y) - rp = 1.0 - for k in 1:c-1 - if (be[i, k] < Mp[i, k] / rp) - y[i] = k - break - end - rp -= Mp[i, k] - end - end - y[y .== 0] .= c - return y -end - -function simple_test_reg(m, X, y) - f, _, _ = fit(m, 1, X, y) - p = predict(m, f, X) - @test norm(p .- y) / norm(y) < 1 - m, f -end - -function test_dummy_classif(m; seed=5154, binary=false, thresh=0.75) - if binary - X, Xt, y, yt = gen_dummy_classif_binary(seed=seed) - else - X, Xt, y, yt = gen_dummy_classif(seed=seed) - end - f, _, _ = fit(m, 1, X, y) - p = typeof(m) <: Probabilistic ? predict_mode(m, f, Xt) : predict(m, f, Xt) - @test sum(p .== yt) / length(yt) ≥ thresh -end - -function simple_test_classif(m, X, y; dummybinary=false, nodummy=false, thresh=0.75) - f, _, _ = fit(m, 1, X, y) - p = predict(m, f, X) - @test eltype(p) == eltype(y) - @test Set(unique(p)) == Set(unique(y)) - nodummy || test_dummy_classif(m; binary=dummybinary, thresh=thresh) - m, f -end - -function simple_test_classif_prob(m, X, y; dummybinary=false, nodummy=false, thresh=0.75) - f, _, _ = fit(m, 1, X, y) - p = predict_mode(m, f, X) - @test eltype(p) == eltype(y) - @test Set(unique(p)) == Set(unique(y)) - p = predict(m, f, X) - @test eltype(p) <: UnivariateFinite - nodummy || test_dummy_classif(m; binary=dummybinary, thresh=thresh) - m, f -end