Merge pull request #597 from JuliaAI/levels

ablaom · web-flow · commit f13ee87efa5b · 2025-11-05T09:45:08.000+13:00
Bump compat CategoricalArrays="1"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -43,7 +43,7 @@ jobs:
       - uses: julia-actions/julia-runtest@v1
         env:
           # This environment variable enables the integration tests:
-          MLJ_TEST_REGISTRY: '1'
+          MLJ_TEST_REGISTRY: "false"
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v4
         with:
diff --git a/Project.toml b/Project.toml
@@ -28,8 +28,8 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-CategoricalArrays = "0.9, 0.10"
-CategoricalDistributions = "0.1"
+CategoricalArrays = "1"
+CategoricalDistributions = "0.2"
 Combinatorics = "1.0"
 Dates = "1"
 Distances = "0.9,0.10"
diff --git a/src/MLJModels.jl b/src/MLJModels.jl
@@ -22,8 +22,7 @@ using Combinatorics
 import Distributions
 import REPL # stdlib, needed for `Term`
 import PrettyPrinting
-import CategoricalDistributions: UnivariateFinite, UnivariateFiniteArray,
-    classes
+import CategoricalDistributions: UnivariateFinite, UnivariateFiniteArray
 import StatisticalTraits # for `info`
 
 # from loading.jl:
diff --git a/src/builtins/Constant.jl b/src/builtins/Constant.jl
@@ -55,7 +55,7 @@ function MLJModelInterface.fit(::ConstantClassifier,
                                y,
                                w=nothing)
     d = Distributions.fit(UnivariateFinite, y, w)
-    C = classes(d)
+    C = levels(d)
     fitresult = (C, Distributions.pdf([d, ], C))
     cache     = nothing
     report    = NamedTuple()
@@ -66,10 +66,10 @@ MLJModelInterface.fitted_params(::ConstantClassifier, fitresult) =
     (target_distribution=fitresult,)
 
 function MLJModelInterface.predict(::ConstantClassifier, fitresult, Xnew)
-    _classes, probs1 = fitresult
+    _levels, probs1 = fitresult
     N = nrows(Xnew)
-    probs = reshape(vcat(fill(probs1, N)...), N, length(_classes))
-    return UnivariateFinite(_classes, probs)
+    probs = reshape(vcat(fill(probs1, N)...), N, length(_levels))
+    return UnivariateFinite(_levels, probs)
 end
 
 
@@ -216,10 +216,11 @@ ConstantRegressor
 
 This "dummy" probabilistic predictor always returns the same distribution, irrespective of
 the provided input pattern. The distribution `d` returned is the `UnivariateFinite`
-distribution based on frequency of classes observed in the training target data. So,
-`pdf(d, level)` is the number of times the training target takes on the value `level`.
-Use `predict_mode` instead of `predict` to obtain the training target mode instead. For
-more on the `UnivariateFinite` type, see the CategoricalDistributions.jl package.
+distribution based on frequency of levels (classes) observed in the training target
+data. So, `pdf(d, level)` is the proportion of times the training target takes on the
+value `level`. Use `predict_mode` instead of `predict` to obtain the training target
+mode. For more on the `UnivariateFinite` type, see the CategoricalDistributions.jl
+package.
 
 Almost any reasonable model is expected to outperform `ConstantClassifier`, which is used
 almost exclusively for testing and establishing performance baselines.
diff --git a/src/builtins/ThresholdPredictors.jl b/src/builtins/ThresholdPredictors.jl
@@ -56,16 +56,16 @@ const ThresholdSupported = Union{keys(_type_given_atom)...}
 
 const ERR_MODEL_UNSPECIFIED = ArgumentError(
     "Expecting atomic model as argument. None specified. ")
-warn_classes(first_class, second_class) =
+warn_levels(first_class, second_class) =
     "Taking positive class as `$(second_class)` and negative class as"*
     "`$(first_class)`."*
     "Coerce target to `OrderedFactor{2}` to suppress this warning, "*
     "ensuring that positive class > negative class. "
-const ERR_CLASSES_DETECTOR = ArgumentError(
+const ERR_LEVELS_DETECTOR = ArgumentError(
     "Targets for detector models must be ordered. Consider coercing to "*
     "`OrderedFactor`, ensuring that outlier class > inlier class. ")
 const ERR_TARGET_NOT_BINARY = ArgumentError(
-    "Target `y` must have two classes in its  pool, even if only one "*
+    "Target `y` must have two levels in its  pool, even if only one "*
     "class is manifest. ")
 const err_unsupported_model_type(T) = ArgumentError(
     "`BinaryThresholdPredictor` does not support atomic models with supertype `$T`. "*
@@ -208,9 +208,9 @@ function MMI.fit(model::ThresholdUnion, verbosity::Int, args...)
         length(L) == 2 || throw(ERR_TARGET_NOT_BINARY)
         first_class, second_class = L
         if model.model isa Probabilistic
-            @warn warn_classes(first_class, second_class)
+            @warn warn_levels(first_class, second_class)
         else
-            throw(ERR_CLASSES_DETECTOR)
+            throw(ERR_LEVELS_DETECTOR)
         end
     end
     model_fitresult, model_cache, model_report = MMI.fit(
@@ -259,7 +259,7 @@ function _predict_threshold(yhat::UnivariateFinite, threshold)
     dict = yhat.prob_given_ref
     length(threshold) == length(dict) || throw(
         ArgumentError(
-        "`length(threshold)` has to equal number of classes in specified "*
+        "`length(threshold)` has to equal number of levels in specified "*
         "`UnivariateFinite` distribution."
         )
     )
@@ -277,14 +277,14 @@ function _predict_threshold(yhat::UnivariateFiniteArray{S,V,R,P,N},
     dict = yhat.prob_given_ref
     length(threshold) == length(dict) || throw(
         ArgumentError(
-        "`length(threshold)` has to equal number of classes in specified "*
+        "`length(threshold)` has to equal number of levels in specified "*
         "`UnivariateFiniteArray`."
         )
     )
     d = yhat.decoder(1)
     levs = levels(d)
     ord = isordered(d)
-    # Array to house the predicted classes
+    # Array to house the predicted levels
     ret = CategoricalArray{V, N, R}(undef, size(yhat), levels=levs, ordered=ord)
     #ret = Array{CategoricalValue{V, R}, N}(undef, size(yhat))
     # `temp` vector allocated once to be used for calculations in each loop
diff --git a/test/builtins/Constant.jl b/test/builtins/Constant.jl
@@ -35,14 +35,14 @@ end
     d = MLJBase.UnivariateFinite([y[1], y[2], y[4]], [0.5, 0.25, 0.25])
 
     yhat = MLJBase.predict_mode(model, fitresult, X)
-    @test MLJBase.classes(yhat[1]) == MLJBase.classes(y[1])
+    @test levels(yhat[1]) == levels(y[1])
     @test yhat[5] == y[1]
     @test length(yhat) == 10
 
     yhat = MLJBase.predict(model, fitresult, X)
     yhat1 = yhat[1]
 
-    for c in MLJBase.classes(d)
+    for c in levels(d)
         Distributions.pdf(yhat1, c) ≈ Distributions.pdf(d, c)
     end
 
diff --git a/test/builtins/ThresholdPredictors.jl b/test/builtins/ThresholdPredictors.jl
@@ -27,9 +27,9 @@ y2_ = categorical(yraw[2:end], ordered=true)
     )
 
     # Check warning when `y` is not ordered:
-    @test_logs((:warn, MLJModels.warn_classes(levels(y_)...)),
+    @test_logs((:warn, MLJModels.warn_levels(levels(y_)...)),
                 MMI.fit(model, 1, MMI.reformat(model, X_, y1_)...))
-    # Check predictions containing two classes
+    # Check predictions containing two levels
     @test_throws ArgumentError BinaryThresholdPredictor(ConstantRegressor())
     @test_logs((:warn, r"`threshold` should be"),
                BinaryThresholdPredictor(atom, threshold=-1))
@@ -88,13 +88,13 @@ end
     v1 = categorical(['a', 'b', 'a'])
     v2 = categorical(['a', 'b', 'a', 'c'])
     # Test with UnivariateFinite object
-    d1 = UnivariateFinite(MMI.classes(v1), [0.4, 0.6])
+    d1 = UnivariateFinite(levels(v1), [0.4, 0.6])
     @test_throws ArgumentError MLJModels._predict_threshold(d1, 0.7)
     @test MLJModels._predict_threshold(d1, (0.7, 0.3)) == v1[2]
     @test MLJModels._predict_threshold(d1, [0.5, 0.5]) == v1[2]
     @test MLJModels._predict_threshold(d1, (0.4, 0.6)) == v1[1]
     @test MLJModels._predict_threshold(d1, [0.2, 0.8]) == v1[1]
-    d2 = UnivariateFinite(MMI.classes(v2), [0.4, 0.3, 0.3])
+    d2 = UnivariateFinite(levels(v2), [0.4, 0.3, 0.3])
     @test_throws ArgumentError MLJModels._predict_threshold(d2, (0.7, 0.3))
     @test MLJModels._predict_threshold(d2, (0.2, 0.5, 0.3)) == v2[1]
     @test MLJModels._predict_threshold(d2, [0.3, 0.2, 0.5]) == v2[2]
@@ -117,14 +117,14 @@ end
 
     # Test with UnivariateFiniteArray object
     probs1 = [0.2 0.8; 0.7 0.3; 0.1 0.9]
-    unf_arr1 = UnivariateFinite(MMI.classes(v1), probs1)
+    unf_arr1 = UnivariateFinite(levels(v1), probs1)
     @test_throws ArgumentError MLJModels._predict_threshold(unf_arr1, 0.7)
     @test MLJModels._predict_threshold(unf_arr1, (0.7, 0.3)) == [v1[2], v1[1], v1[2]]
     @test MLJModels._predict_threshold(unf_arr1, [0.5, 0.5]) == [v1[2], v1[1], v1[2]]
     @test MLJModels._predict_threshold(unf_arr1, (0.4, 0.6)) == [v1[2], v1[1], v1[2]]
     @test MLJModels._predict_threshold(unf_arr1, [0.2, 0.8]) == [v1[1], v1[1], v1[2]]
     probs2 = [0.2 0.3 0.5;0.1 0.6 0.3; 0.4 0.0 0.6]
-    unf_arr2 = UnivariateFinite(MMI.classes(v2), probs2)
+    unf_arr2 = UnivariateFinite(levels(v2), probs2)
     @test_throws ArgumentError MLJModels._predict_threshold(unf_arr2, (0.7, 0.3))
     @test MLJModels._predict_threshold(unf_arr2, (0.2, 0.5, 0.3)) == [v2[4], v2[2], v2[1]]
     @test MLJModels._predict_threshold(unf_arr2, [0.3, 0.2, 0.5]) == [v2[2], v2[2], v2[1]]
@@ -144,7 +144,7 @@ MMI.input_scitype(::Type{<:DummyDetector}) = MMI.Table
 
 @testset "BinaryThresholdPredictor - ProbabilisticUnsupervisedDetector" begin
     detector = BinaryThresholdPredictor(DummyDetector(), threshold=0.2)
-    @test_throws MLJModels.ERR_CLASSES_DETECTOR MMI.fit(
+    @test_throws MLJModels.ERR_LEVELS_DETECTOR MMI.fit(
         detector, 1, MMI.reformat(detector, X_, y1_)...
     )
 
diff --git a/test/testutils.jl b/test/testutils.jl