Skip to content

Commit 7e5f651

Browse files
committed
add learners/classification.jl and tests
1 parent 1d165a2 commit 7e5f651

File tree

8 files changed

+194
-4
lines changed

8 files changed

+194
-4
lines changed

Project.toml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ authors = ["Anthony D. Blaom <[email protected]>"]
44
version = "0.2.1"
55

66
[deps]
7+
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
8+
CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e"
79
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
810
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
911
IsURL = "ceb4388c-583f-448d-bb30-00b11e8c5682"
@@ -23,6 +25,8 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2325
UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
2426

2527
[compat]
28+
CategoricalArrays = "0.10.8"
29+
CategoricalDistributions = "0.1.15"
2630
Distributions = "0.25"
2731
InteractiveUtils = "<0.0.1, 1"
2832
IsURL = "0.2.0"
@@ -47,7 +51,16 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
4751
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
4852
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
4953
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
54+
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
5055
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
5156

5257
[targets]
53-
test = ["DataFrames", "Distributions", "Random", "LinearAlgebra", "Statistics", "Tables"]
58+
test = [
59+
"DataFrames",
60+
"Distributions",
61+
"Random",
62+
"LinearAlgebra",
63+
"Statistics",
64+
"StatsModels",
65+
"Tables",
66+
]

src/LearnTestAPI.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ include("logging.jl")
5353
include("testapi.jl")
5454
include("learners/static_algorithms.jl")
5555
include("learners/regression.jl")
56+
include("learners/classification.jl")
5657
include("learners/ensembling.jl")
5758
# next learner excluded because of heavy dependencies:
5859
# include("learners/gradient_descent.jl")

src/learners/classification.jl

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# This file defines `ConstantClassifier()`
2+
3+
using LearnAPI
4+
import LearnDataFrontEnds as FrontEnds
5+
import MLCore
6+
import CategoricalArrays
7+
import CategoricalDistributions
8+
import CategoricalDistributions.OrderedCollections.OrderedDict
9+
import CategoricalDistributions.Distributions.StatsBase.proportionmap
10+
11+
# The implementation of a constant classifier below is not the simplest, but it
12+
# demonstrates some patterns that apply more generally in classification.
13+
14+
"""
15+
ConstantClassifier()
16+
17+
Instantiate a constant (dummy) classifier. Can predict `Point` or `Distribution` targets.
18+
19+
"""
20+
struct ConstantClassifier end
21+
22+
struct ConstantClassifierFitted
23+
learner::ConstantClassifier
24+
probabilities
25+
names::Vector{Symbol}
26+
classes_seen
27+
codes_seen
28+
decoder
29+
end
30+
31+
# accessor: recover the learner from its fitted output
LearnAPI.learner(model::ConstantClassifierFitted) = model.learner

# add a data front end; `obs` will return objects with type `FrontEnds.Obs`:
const front_end = FrontEnds.Sage(code_type=:small)
LearnAPI.obs(learner::ConstantClassifier, data) =
    FrontEnds.fitobs(learner, data, front_end)
LearnAPI.obs(model::ConstantClassifierFitted, data) =
    obs(model, data, front_end)

# data deconstructors (the 3-argument methods are supplied by the front end):
LearnAPI.features(learner::ConstantClassifier, data) =
    LearnAPI.features(learner, data, front_end)
LearnAPI.target(learner::ConstantClassifier, data) =
    LearnAPI.target(learner, data, front_end)
45+
46+
# Training: record the empirical class distribution of the target; nothing else is
# learned. Returns a `ConstantClassifierFitted` carrying everything `predict` needs.
function LearnAPI.fit(learner::ConstantClassifier, observations::FrontEnds.Obs; verbosity=1)
    codes = observations.target             # the target, as integer "codes"
    distinct_codes = sort(unique(codes))    # codes seen, in increasing order
    proportions = proportionmap(codes)      # code => empirical proportion

    # one probability per distinct code, in code order (keys of `proportions` are
    # exactly `unique(codes)`, so every lookup succeeds):
    probs = [proportions[code] for code in distinct_codes]

    return ConstantClassifierFitted(
        learner,
        probs,
        observations.names,
        observations.classes_seen,
        distinct_codes,
        observations.decoder,
    )
end
# fallback for raw (un-preprocessed) data: route through the data front end first
LearnAPI.fit(learner::ConstantClassifier, data; kwargs...) =
    fit(learner, obs(learner, data); kwargs...)
68+
69+
# `Point` predictions: every observation is assigned the modal (most probable) class.
function LearnAPI.predict(
    model::ConstantClassifierFitted,
    ::Point,
    observations::FrontEnds.Obs,
)
    nobs = MLCore.numobs(observations)
    # integer code of the class with the largest training proportion:
    modal_code = model.codes_seen[last(findmax(model.probabilities))]
    # decode the repeated code back to categorical values, one per observation:
    return model.decoder.(fill(modal_code, nobs))
end
LearnAPI.predict(model::ConstantClassifierFitted, ::Point, data) =
    predict(model, Point(), obs(model, data))
81+
82+
# `Distribution` predictions: the same `UnivariateFinite` distribution (the empirical
# training distribution) is returned for every observation.
function LearnAPI.predict(
    model::ConstantClassifierFitted,
    ::Distribution,
    observations::FrontEnds.Obs,
)
    nobs = MLCore.numobs(observations)
    p = model.probabilities
    # n×k matrix whose every row is `p` (one row per observation):
    prob_rows = repeat(reshape(p, 1, length(p)), nobs, 1)
    return CategoricalDistributions.UnivariateFinite(model.classes_seen, prob_rows)
end
LearnAPI.predict(model::ConstantClassifierFitted, ::Distribution, data) =
    predict(model, Distribution(), obs(model, data))
95+
96+
# accessor function:
LearnAPI.feature_names(model::ConstantClassifierFitted) = model.names

# declare the LearnAPI.jl contract implemented by `ConstantClassifier`:
@trait(
    ConstantClassifier,
    constructor = ConstantClassifier,
    kinds_of_proxy = (Point(),Distribution()),
    tags = ("classification",),
    functions = (
        :(LearnAPI.fit),
        :(LearnAPI.learner),
        :(LearnAPI.clone),
        :(LearnAPI.strip),
        :(LearnAPI.obs),
        :(LearnAPI.features),
        :(LearnAPI.target),
        :(LearnAPI.predict),
        :(LearnAPI.feature_names),
    )
)

# value returned by `include`; signals the file loaded successfully
true

src/learners/ensembling.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ LearnAPI.components(model::EnsembleFitted) = [:atom => model.models,]
211211
# - `out_of_sample_losses`
212212

213213
# For simplicity, this implementation is restricted to univariate features. The simplistic
214-
# algorithm is explained in the docstring. of the data presented.
214+
# algorithm is explained in the docstring.
215215

216216

217217
# ## HELPERS
@@ -276,6 +276,7 @@ function update!(
276276
stump = Stump(ξ, left, right)
277277
push!(forest, stump)
278278
new_predictions = _predict(stump, x)
279+
279280
# efficient in-place update of `predictions`:
280281
predictions .= (k*predictions .+ new_predictions)/(k + 1)
281282
push!(training_losses, (predictions[training_indices] .- ytrain).^2 |> sum)

test/learners/classification.jl

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
using Test
2+
using LearnTestAPI
3+
using LearnAPI
4+
import MLCore
5+
using StableRNGs
6+
import DataFrames
7+
using Tables
8+
import CategoricalArrays
9+
import StatsModels: @formula
10+
import CategoricalDistributions.pdf
11+
12+
# # SYNTHESIZE LOTS OF DATASETS

n = 2
rng = StableRNG(345)
# categorical target with 3n observations; the pool also
# has a "hidden" level, `C`:
t = CategoricalArrays.categorical(repeat("ABA", 3n)*"CC" |> collect)[1:3n]
# two numeric features:
c, a = randn(rng, 3n), rand(rng, 3n)
y = t
Y = (; t)

# feature matrix:
x = hcat(c, a) |> permutedims

# feature tables (same data in several Tables.jl-compatible forms):
X = (; c, a)
X1, X2, X3, X4, X5 = X,
    Tables.rowtable(X),
    Tables.dictrowtable(X),
    Tables.dictcolumntable(X),
    DataFrames.DataFrame(X);

# full tables (features and target together):
T = (; c, t, a)
T1, T2, T3, T4, T5 = T,
    Tables.rowtable(T),
    Tables.dictrowtable(T),
    Tables.dictcolumntable(T),
    DataFrames.DataFrame(T);

# StatsModels.jl @formula:
f = @formula(t ~ c + a)
43+
44+
45+
# # TESTS
46+
47+
learner = LearnTestAPI.ConstantClassifier()
48+
@testapi learner (X1, y)
49+
@testapi learner (X2, y) (X3, y) (X4, y) (T1, :t) (T2, :t) (T3, f) (T4, f) verbosity=0
50+
51+
@testset "extra tests for constant classifier" begin
52+
model = fit(learner, (x, y))
53+
@test predict(model, x) == fill('A', 3n)
54+
@test pdf.(predict(model, Distribution(), x), 'A') fill(2/3, 3n)
55+
end
56+
57+
true

test/learners/ensembling.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ learner = LearnTestAPI.Ensemble(atom; n=4, rng)
3030
@testset "extra tests for ensemble" begin
3131
@test LearnAPI.clone(learner) == learner
3232
@test LearnAPI.target(learner, data) == y
33-
@test LearnAPI.features(learner, data) == X
33+
@test LearnAPI.features(learner, data).features == Tables.matrix(X)'
3434

3535
model = @test_logs(
3636
(:info, r"Trained 4 ridge"),

test/learners/regression.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ data = (X, y)
2222
learner = LearnTestAPI.Ridge(lambda=0.5)
2323
@testapi learner data verbosity=1
2424

25-
@testset "extra tests for ridge regression" begin
25+
@testset "extra tests for ridge regressor" begin
2626
@test :(LearnAPI.obs) in LearnAPI.functions(learner)
2727

2828
@test LearnAPI.target(learner, data) == y

test/runtests.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ test_files = [
44
"tools.jl",
55
"learners/static_algorithms.jl",
66
"learners/regression.jl",
7+
"learners/classification.jl",
78
"learners/ensembling.jl",
89
# "learners/gradient_descent.jl",
910
"learners/incremental_algorithms.jl",

0 commit comments

Comments
 (0)