From 40fd7733f26d69a05520f0a1f8880dd01235b797 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 11 Oct 2024 14:18:13 +1300 Subject: [PATCH 1/4] add a perceptron classifier to examples --- docs/src/common_implementation_patterns.md | 4 +- docs/src/patterns/classification.md | 4 + docs/src/patterns/gradient_descent.md | 5 + docs/src/patterns/iterative_algorithms.md | 8 +- test/integration/gradient_descent.jl | 386 +++++++++++++++++++++ 5 files changed, 402 insertions(+), 5 deletions(-) create mode 100644 docs/src/patterns/gradient_descent.md create mode 100644 test/integration/gradient_descent.jl diff --git a/docs/src/common_implementation_patterns.md b/docs/src/common_implementation_patterns.md index 7a5f357c..29e67ca8 100644 --- a/docs/src/common_implementation_patterns.md +++ b/docs/src/common_implementation_patterns.md @@ -22,12 +22,12 @@ implementations fall into one (or more) of the following informally understood p - [Regression](@ref): Supervised learners for continuous targets -- Classification: Supervised learners for categorical targets +- [Classification](@ref): Supervised learners for categorical targets - Clusterering: Algorithms that group data into clusters for classification and possibly dimension reduction. May be true learners (generalize to new data) or static. -- Gradient Descent: Including neural networks. +- [Gradient Descent](@ref): Including neural networks. - [Iterative Algorithms](@ref) diff --git a/docs/src/patterns/classification.md b/docs/src/patterns/classification.md index 4e8066d9..86fa1158 100644 --- a/docs/src/patterns/classification.md +++ b/docs/src/patterns/classification.md @@ -1 +1,5 @@ # Classification + +See these examples from tests: + +- [perceptron classifier](https://github.com/JuliaAI/LearnAPI.jl/blob/dev/test/integration/gradient_descent.jl) diff --git a/docs/src/patterns/gradient_descent.md b/docs/src/patterns/gradient_descent.md new file mode 100644 index 00000000..acded653 --- /dev/null +++ b/docs/src/patterns/gradient_descent.md @@ -0,0 +1,5 @@ +# Gradient Descent + +See [this +example](https://github.com/JuliaAI/LearnAPI.jl/blob/dev/test/integration/gradient_descent.jl) +from tests. diff --git a/docs/src/patterns/iterative_algorithms.md b/docs/src/patterns/iterative_algorithms.md index be2a0c7f..abab6316 100644 --- a/docs/src/patterns/iterative_algorithms.md +++ b/docs/src/patterns/iterative_algorithms.md @@ -1,5 +1,7 @@ # Iterative Algorithms -See [this -example](https://github.com/JuliaAI/LearnAPI.jl/blob/dev/test/integration/ensembling.jl) -from tests. +See these examples from tests: + +- [bagged ensembling](https://github.com/JuliaAI/LearnAPI.jl/blob/dev/test/integration/ensembling.jl) + +- [perceptron classifier](https://github.com/JuliaAI/LearnAPI.jl/blob/dev/test/integration/gradient_descent.jl) diff --git a/test/integration/gradient_descent.jl b/test/integration/gradient_descent.jl new file mode 100644 index 00000000..6b582225 --- /dev/null +++ b/test/integration/gradient_descent.jl @@ -0,0 +1,386 @@ +using Pkg +Pkg.activate("perceptron", shared=true) + +using LearnAPI +using Random +using Statistics +using StableRNGs +import Optimisers +import Zygote +import NNlib +import CategoricalDistributions +import CategoricalDistributions: pdf, mode +import ComponentArrays + +# # PERCEPTRON + +# We implement a simple perceptron classifier to illustrate some common patterns for +# gradient descent algorithms. 
This includes implementation of the following methods:
+
+# - `update`
+# - `update_observations`
+# - `iteration_parameter`
+# - `training_losses`
+# - `obs` for pre-processing (non-tabular) classification training data
+# - `predict(algorithm, ::Distribution, Xnew)`
+
+# For simplicity, we use single-observation batches for gradient descent updates, and we
+# may dodge some standard optimizations.
+
+# This is also an example of a probability-predicting classifier.
+
+
+# ## Helpers
+
+"""
+    brier_loss(probs, hot)
+
+Return Brier (quadratic) loss.
+
+- `probs`: predicted probability vector
+- `hot`: corresponding ground truth observation, as a one-hot encoded bit vector
+
+"""
+function brier_loss(probs, hot)
+    offset = 1 + sum(probs.^2)
+    return offset - 2*(sum(probs.*hot))
+end
+
+"""
+    corefit(perceptron, X, y_hot, epochs, state, verbosity)
+
+Return updated `perceptron`, `state` and training losses by carrying out gradient descent
+for the specified number of `epochs`.
+
+- `perceptron`: component array with components `weights` and `bias`
+- `X`: feature matrix, of size (p, n)
+- `y_hot`: one-hot encoded target, of size (nclasses, n)
+- `epochs`: number of epochs
+- `state`: optimiser state (from Optimisers.jl)
+
+"""
+function corefit(perceptron, X, y_hot, epochs, state, verbosity)
+    n = size(y_hot) |> last
+    losses = map(1:epochs) do _
+        total_loss = zero(Float32)
+        for i in 1:n
+            loss, grad = Zygote.withgradient(perceptron) do p
+                probs = p.weights*X[:,i] + p.bias |> NNlib.softmax
+                brier_loss(probs, y_hot[:,i])
+            end
+            ∇loss = only(grad)
+            state, perceptron = Optimisers.update(state, perceptron, ∇loss)
+            total_loss += loss
+        end
+        # make some noise, if allowed:
+        verbosity > 0 && @info "Training loss: $total_loss"
+        total_loss
+    end
+    return perceptron, state, losses
+end
+
+
+# ## Implementation
+
+# ### Algorithm
+
+# no docstring here - that goes with the constructor;
+# SOME FIELDS LEFT ABSTRACT FOR SIMPLICITY
+struct PerceptronClassifier
+    epochs::Int
+    optimiser # an optimiser from Optimisers.jl
+    rng
+end
+
+"""
+    PerceptronClassifier(; epochs=50, optimiser=Optimisers.Adam(), rng=Random.default_rng())
+
+Instantiate a perceptron classifier.
+
+Train an instance, `algorithm`, by doing `model = fit(algorithm, X, y)`, where
+
+- `X` is a `Float32` matrix, with observations-as-columns
+- `y` (target) is some one-dimensional `CategoricalArray`.
+
+Get probabilistic predictions with `predict(model, Xnew)` and
+point predictions with `predict(model, Point(), Xnew)`.
+
+# Warm restart options
+
+    update_observations(model, newdata; replacements...)
+
+Return an updated model, with the weights and bias of the previously learned perceptron
+used as the starting state in new gradient descent updates. Adopt any specified
+hyperparameter `replacements` (properties of `LearnAPI.algorithm(model)`).
+
+    update(model, newdata; epochs=n, replacements...)
+
+If `Δepochs = n - LearnAPI.algorithm(model).epochs` is non-negative, then return an
+updated model, with the weights and bias of the previously learned perceptron used as the
+starting state in new gradient descent updates for `Δepochs` epochs, and using the
+provided `newdata` instead of the previous training data. Any other hyperparameter
+`replacements` are also adopted. If `Δepochs` is negative or not specified, instead
+return `fit(algorithm, newdata)`, where
+`algorithm = LearnAPI.clone(algorithm; epochs=n, replacements...)`.
+
+"""
+PerceptronClassifier(; epochs=50, optimiser=Optimisers.Adam(), rng=Random.default_rng()) =
+    PerceptronClassifier(epochs, optimiser, rng)
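+
+# A minimal usage sketch (hypothetical `X`, `y`, `Xnew`, as described in the docstring
+# above; see the tests at the end of this file for a complete, runnable example):
+#
+#     algorithm = PerceptronClassifier(epochs=40)
+#     model = fit(algorithm, X, y)
+#     predict(model, Point(), Xnew)            # point predictions
+#     predict(model, Distribution(), Xnew)     # probabilistic predictions
+#     model = update(model, X, y; epochs=70)   # warm restart: 30 additional epochs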
+
+
+# ### Data interface
+
+# For raw training data:
+LearnAPI.target(algorithm::PerceptronClassifier, data::Tuple) = last(data)
+
+# For wrapping pre-processed training data (output of `obs(algorithm, data)`):
+struct PerceptronClassifierObservations
+    X::Matrix{Float32}
+    y_hot::BitMatrix # one-hot encoded target
+    classes # the (ordered) pool of `y`, as `CategoricalValue`s
+end
+
+# For pre-processing the training data:
+function LearnAPI.obs(algorithm::PerceptronClassifier, data::Tuple)
+    X, y = data
+    classes = CategoricalDistributions.classes(y)
+    y_hot = classes .== permutedims(y) # one-hot encoding
+    return PerceptronClassifierObservations(X, y_hot, classes)
+end
+
+# implement `RandomAccess()` interface for output of `obs`:
+Base.length(observations::PerceptronClassifierObservations) =
+    size(observations.y_hot, 2)
+Base.getindex(observations::PerceptronClassifierObservations, I) =
+    PerceptronClassifierObservations(
+        (@view observations.X[:, I]),
+        (@view observations.y_hot[:, I]),
+        observations.classes,
+    )
+
+# target recovered by decoding the one-hot representation:
+LearnAPI.target(
+    algorithm::PerceptronClassifier,
+    observations::PerceptronClassifierObservations,
+) = observations.classes[map(findfirst, eachcol(observations.y_hot))]
+
+LearnAPI.features(
+    algorithm::PerceptronClassifier,
+    observations::PerceptronClassifierObservations,
+) = observations.X
+
+# Note that data consumed by `predict` needs no pre-processing, so no need to overload
+# `obs(model, data)`.
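+
+# For illustration only (a sketch, not used by the tests below): if `y` has the two
+# levels "A" and "B", and `X` has size (2, 3), then `obs` returns a
+# `PerceptronClassifierObservations` object whose `y_hot` has one column per observation:
+#
+#     observations = obs(PerceptronClassifier(), (X, y))
+#     observations.y_hot   # Bool[1 0 1; 0 1 0], when y == ["A", "B", "A"]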
+
+
+# ### Fitting and updating
+
+# For wrapping outcomes of learning:
+struct PerceptronClassifierFitted
+    algorithm::PerceptronClassifier
+    perceptron # component array storing weights and bias
+    state # optimiser state
+    classes # target classes
+    losses
+end
+
+LearnAPI.algorithm(model::PerceptronClassifierFitted) = model.algorithm
+
+# `fit` for pre-processed data (output of `obs(algorithm, data)`):
+function LearnAPI.fit(
+    algorithm::PerceptronClassifier,
+    observations::PerceptronClassifierObservations;
+    verbosity=1,
+    )
+
+    # unpack hyperparameters:
+    epochs = algorithm.epochs
+    optimiser = algorithm.optimiser
+    rng = deepcopy(algorithm.rng) # to prevent mutation of `algorithm`!
+
+    # unpack data:
+    X = observations.X
+    y_hot = observations.y_hot
+    classes = observations.classes
+    nclasses = length(classes)
+    p = size(X, 1) # number of features
+
+    # initialize bias and weights:
+    weights = randn(rng, Float32, nclasses, p)
+    bias = zeros(Float32, nclasses)
+    perceptron = (; weights, bias) |> ComponentArrays.ComponentArray
+
+    # initialize optimiser:
+    state = Optimisers.setup(optimiser, perceptron)
+
+    perceptron, state, losses = corefit(perceptron, X, y_hot, epochs, state, verbosity)
+
+    return PerceptronClassifierFitted(algorithm, perceptron, state, classes, losses)
+end
+
+# `fit` for unprocessed data:
+LearnAPI.fit(algorithm::PerceptronClassifier, data; kwargs...) =
+    fit(algorithm, obs(algorithm, data); kwargs...)
+
+# see the `PerceptronClassifier` docstring for `update_observations` logic.
+function LearnAPI.update_observations(
+    model::PerceptronClassifierFitted,
+    observations::PerceptronClassifierObservations;
+    verbosity=1,
+    replacements...,
+    )
+
+    # unpack data:
+    X = observations.X
+    y_hot = observations.y_hot
+    classes = observations.classes
+    nclasses = length(classes)
+
+    classes == model.classes || error("New training target has incompatible classes.")
+
+    algorithm_old = LearnAPI.algorithm(model)
+    algorithm = LearnAPI.clone(algorithm_old; replacements...)
+
+    perceptron = model.perceptron
+    state = model.state
+    losses = model.losses
+    epochs = algorithm.epochs
+
+    perceptron, state, losses_new = corefit(perceptron, X, y_hot, epochs, state, verbosity)
+    losses = vcat(losses, losses_new)
+
+    return PerceptronClassifierFitted(algorithm, perceptron, state, classes, losses)
+end
+LearnAPI.update_observations(model::PerceptronClassifierFitted, data; kwargs...) =
+    update_observations(model, obs(LearnAPI.algorithm(model), data); kwargs...)
+
+# see the `PerceptronClassifier` docstring for `update` logic.
+function LearnAPI.update(
+    model::PerceptronClassifierFitted,
+    observations::PerceptronClassifierObservations;
+    verbosity=1,
+    replacements...,
+    )
+
+    # unpack data:
+    X = observations.X
+    y_hot = observations.y_hot
+    classes = observations.classes
+    nclasses = length(classes)
+
+    classes == model.classes || error("New training target has incompatible classes.")
+
+    algorithm_old = LearnAPI.algorithm(model)
+    algorithm = LearnAPI.clone(algorithm_old; replacements...)
+    :epochs in keys(replacements) || return fit(algorithm, observations)
+
+    perceptron = model.perceptron
+    state = model.state
+    losses = model.losses
+
+    epochs = algorithm.epochs
+    Δepochs = epochs - algorithm_old.epochs
+    Δepochs < 0 && return fit(algorithm, observations)
+
+    perceptron, state, losses_new = corefit(perceptron, X, y_hot, Δepochs, state, verbosity)
+    losses = vcat(losses, losses_new)
+
+    return PerceptronClassifierFitted(algorithm, perceptron, state, classes, losses)
+end
+LearnAPI.update(model::PerceptronClassifierFitted, data; kwargs...) =
+    update(model, obs(LearnAPI.algorithm(model), data); kwargs...)
+
+
+# ### Predict
+
+function LearnAPI.predict(model::PerceptronClassifierFitted, ::Distribution, Xnew)
+    perceptron = model.perceptron
+    classes = model.classes
+    probs = perceptron.weights*Xnew .+ perceptron.bias |> NNlib.softmax
+    return CategoricalDistributions.UnivariateFinite(classes, probs')
+end
+
+LearnAPI.predict(model::PerceptronClassifierFitted, ::Point, Xnew) =
+    mode.(predict(model, Distribution(), Xnew))
+
+
+# ### Accessor functions
+
+LearnAPI.training_losses(model::PerceptronClassifierFitted) = model.losses
+
+
+# ### Traits
+
+@trait(
+    PerceptronClassifier,
+    constructor = PerceptronClassifier,
+    iteration_parameter = :epochs,
+    kinds_of_proxy = (Distribution(), Point()),
+    tags = ("classification", "iterative algorithms", "incremental algorithms"),
+    functions = (
+        :(LearnAPI.fit),
+        :(LearnAPI.algorithm),
+        :(LearnAPI.minimize),
+        :(LearnAPI.obs),
+        :(LearnAPI.features),
+        :(LearnAPI.target),
+        :(LearnAPI.update),
+        :(LearnAPI.update_observations),
+        :(LearnAPI.predict),
+        :(LearnAPI.training_losses),
+    )
+)
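+
+# With these traits declared, generic LearnAPI client code can introspect the algorithm.
+# For example (a sketch, not part of the tests below):
+#
+#     LearnAPI.iteration_parameter(PerceptronClassifier())               # :epochs
+#     :(LearnAPI.predict) in LearnAPI.functions(PerceptronClassifier())  # true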
+
+
+# ## Tests
+
+# synthetic test data:
+N = 10
+n = 10N # number of observations
+p = 2 # number of features
+train = 1:6N
+test = (6N+1:10N)
+rng = StableRNG(123)
+X = randn(rng, Float32, p, n);
+coefficients = rand(rng, Float32, p)'
+y_continuous = coefficients*X |> vec
+η1 = quantile(y_continuous, 1/3)
+η2 = quantile(y_continuous, 2/3)
+y = map(y_continuous) do η
+    η < η1 && return "A"
+    η < η2 && return "B"
+    "C"
+end |> CategoricalDistributions.categorical;
+Xtrain = X[:, train];
+Xtest = X[:, test];
+ytrain = y[train];
+ytest = y[test];
+
+@testset "PerceptronClassifier" begin
+    rng = StableRNG(123)
+    algorithm = PerceptronClassifier(; optimiser=Optimisers.Adam(0.01), epochs=40, rng)
+    @test LearnAPI.clone(algorithm) == algorithm
+    @test :(LearnAPI.update) in LearnAPI.functions(algorithm)
+    @test LearnAPI.target(algorithm, (X, y)) == y
+    @test LearnAPI.features(algorithm, (X, y)) == X
+
+    model40 = fit(algorithm, Xtrain, ytrain; verbosity=0)
+
+    # 40 epochs is sufficient for 90% accuracy in this case:
+    @test sum(predict(model40, Point(), Xtest) .== ytest)/length(ytest) > 0.9
+
+    # get probabilistic predictions:
+    ŷ40 = predict(model40, Distribution(), Xtest);
+    @test predict(model40, Xtest) ≈ ŷ40
+
+    # add 30 epochs in an `update`:
+    model70 = update(model40, Xtrain, y[train]; verbosity=0, epochs=70)
+    ŷ70 = predict(model70, Xtest);
+    @test !(ŷ70 ≈ ŷ40)
+
+    # compare with cold restart:
+    model = fit(LearnAPI.clone(algorithm; epochs=70), Xtrain, y[train]; verbosity=0);
+    @test ŷ70 ≈ predict(model, Xtest)
+
+    # add 30 epochs using `update_observations` instead:
+    model70b = update_observations(model40, Xtrain, y[train]; verbosity=0, epochs=30)
+    @test ŷ70 ≈ predict(model70b, Xtest) ≈ predict(model, Xtest)
+end
+
+true

From 09290666b9ab1e8d0c4e972b949ab61f2e386f5c Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Fri, 11 Oct 2024 14:22:05 +1300
Subject: [PATCH 2/4] update roadmap

---
 ROADMAP.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ROADMAP.md b/ROADMAP.md
index cd524ba5..7da578ae 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -16,7 +16,7 @@
   - [x] regression
   - [ ] classification
   - [ ] clustering
-  - [ ] gradient descent
+  - [x] gradient descent
   - [x] iterative algorithms
   - [ ] incremental algorithms
   - [ ] dimension reduction

From 671491c8db8078ddacaf65ccf32ceaaf82a694a9 Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Fri, 11 Oct 2024 14:31:09 +1300
Subject: [PATCH 3/4] fix outdated file reference in tests

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index dee0c17b..2c66588d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,7 +6,7 @@ test_files = [
     "clone.jl",
     "integration/regression.jl",
     "integration/static_algorithms.jl",
-    "integration/iterative_algorithms.jl",
+    "integration/ensembling.jl",
 ]

 files = isempty(ARGS) ? test_files : ARGS

From 458b03cb12867632fdcd7aca8da0229e62c22d88 Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Fri, 11 Oct 2024 14:52:36 +1300
Subject: [PATCH 4/4] fix flawed test

---
 test/integration/ensembling.jl | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/test/integration/ensembling.jl b/test/integration/ensembling.jl
index 1e92e203..657f0d29 100644
--- a/test/integration/ensembling.jl
+++ b/test/integration/ensembling.jl
@@ -194,13 +194,14 @@ Xtest = Tables.subset(X, test)
     ŷ7 = predict(model, Xtest)

     # compare with cold restart:
-    model = fit(LearnAPI.clone(algorithm; n=7), Xtrain, y[train]; verbosity=0);
-    @test ŷ7 ≈ predict(model, Xtest)
+    model_cold = fit(LearnAPI.clone(algorithm; n=7), Xtrain, y[train]; verbosity=0);
+    @test ŷ7 ≈ predict(model_cold, Xtest)

-    # test cold restart if another hyperparameter is changed:
+    # test that we get a cold restart if another hyperparameter is changed:
     model2 = update(model, Xtrain, y[train]; atom=Ridge(0.05))
-    algorithm2 = LearnAPI.clone(LearnAPI.algorithm(model); atom=Ridge(0.05))
-    @test predict(model, Xtest) ≈ predict(model2, Xtest)
+    algorithm2 = Ensemble(Ridge(0.05); n=7, rng)
+    model_cold = fit(algorithm2, Xtrain, y[train]; verbosity=0)
+    @test predict(model2, Xtest) ≈ predict(model_cold, Xtest)
 end