From 40fd7733f26d69a05520f0a1f8880dd01235b797 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 11 Oct 2024 14:18:13 +1300 Subject: [PATCH 1/4] add a perceptron classifier to examples --- docs/src/common_implementation_patterns.md | 4 +- docs/src/patterns/classification.md | 4 + docs/src/patterns/gradient_descent.md | 5 + docs/src/patterns/iterative_algorithms.md | 8 +- test/integration/gradient_descent.jl | 386 +++++++++++++++++++++ 5 files changed, 402 insertions(+), 5 deletions(-) create mode 100644 docs/src/patterns/gradient_descent.md create mode 100644 test/integration/gradient_descent.jl diff --git a/docs/src/common_implementation_patterns.md b/docs/src/common_implementation_patterns.md index 7a5f357c..29e67ca8 100644 --- a/docs/src/common_implementation_patterns.md +++ b/docs/src/common_implementation_patterns.md @@ -22,12 +22,12 @@ implementations fall into one (or more) of the following informally understood p - [Regression](@ref): Supervised learners for continuous targets -- Classification: Supervised learners for categorical targets +- [Classification](@ref): Supervised learners for categorical targets - Clusterering: Algorithms that group data into clusters for classification and possibly dimension reduction. May be true learners (generalize to new data) or static. -- Gradient Descent: Including neural networks. +- [Gradient Descent](@ref): Including neural networks. - [Iterative Algorithms](@ref) diff --git a/docs/src/patterns/classification.md b/docs/src/patterns/classification.md index 4e8066d9..86fa1158 100644 --- a/docs/src/patterns/classification.md +++ b/docs/src/patterns/classification.md @@ -1 +1,5 @@ # Classification + +See these examples from tests: + +- [perceptron classifier](https://github.com/JuliaAI/LearnAPI.jl/blob/dev/test/integration/gradient_descent.jl) diff --git a/docs/src/patterns/gradient_descent.md b/docs/src/patterns/gradient_descent.md new file mode 100644 index 00000000..acded653 --- /dev/null +++ b/docs/src/patterns/gradient_descent.md @@ -0,0 +1,5 @@ +# Gradient Descent + +See [this +example](https://github.com/JuliaAI/LearnAPI.jl/blob/dev/test/integration/gradient_descent.jl) +from tests. diff --git a/docs/src/patterns/iterative_algorithms.md b/docs/src/patterns/iterative_algorithms.md index be2a0c7f..abab6316 100644 --- a/docs/src/patterns/iterative_algorithms.md +++ b/docs/src/patterns/iterative_algorithms.md @@ -1,5 +1,7 @@ # Iterative Algorithms -See [this -example](https://github.com/JuliaAI/LearnAPI.jl/blob/dev/test/integration/ensembling.jl) -from tests. +See these examples from tests: + +- [bagged ensembling](https://github.com/JuliaAI/LearnAPI.jl/blob/dev/test/integration/ensembling.jl) + +- [perceptron classifier](https://github.com/JuliaAI/LearnAPI.jl/blob/dev/test/integration/gradient_descent.jl) diff --git a/test/integration/gradient_descent.jl b/test/integration/gradient_descent.jl new file mode 100644 index 00000000..6b582225 --- /dev/null +++ b/test/integration/gradient_descent.jl @@ -0,0 +1,386 @@ +using Pkg +Pkg.activate("perceptron", shared=true) + +using LearnAPI +using Random +using Statistics +using StableRNGs +import Optimisers +import Zygote +import NNlib +import CategoricalDistributions +import CategoricalDistributions: pdf, mode +import ComponentArrays + +# # PERCEPTRON + +# We implement a simple perceptron classifier to illustrate some common patterns for +# gradient descent algorithms. 
This includes implementation of the following methods:
+
+# - `update`
+# - `update_observations`
+# - `iteration_parameter`
+# - `training_losses`
+# - `obs` for pre-processing (non-tabular) classification training data
+# - `predict(algorithm, ::Distribution, Xnew)`
+
+# For simplicity, we use single-observation batches for gradient descent updates, and we
+# may dodge some standard optimizations.
+
+# This is also an example of a probability-predicting classifier.
+
+
+# ## Helpers
+
+"""
+    brier_loss(probs, hot)
+
+Return Brier (quadratic) loss.
+
+- `probs`: predicted probability vector
+- `hot`: corresponding ground truth observation, as a one-hot encoded bit vector
+
+"""
+function brier_loss(probs, hot)
+    offset = 1 + sum(probs.^2)
+    return offset - 2*(sum(probs.*hot))
+end
+
+"""
+    corefit(perceptron, X, y_hot, epochs, state, verbosity)
+
+Return updated `perceptron`, `state` and training losses by carrying out gradient descent
+for the specified number of `epochs`.
+
+- `perceptron`: component array with components `weights` and `bias`
+- `X`: feature matrix, of size (p, n)
+- `y_hot`: one-hot encoded target, of size (nclasses, n)
+- `epochs`: number of epochs
+- `state`: optimiser state (from Optimisers.jl)
+
+"""
+function corefit(perceptron, X, y_hot, epochs, state, verbosity)
+    n = size(y_hot) |> last
+    losses = map(1:epochs) do _
+        total_loss = zero(Float32)
+        for i in 1:n
+            loss, grad = Zygote.withgradient(perceptron) do p
+                probs = p.weights*X[:,i] + p.bias |> NNlib.softmax
+                brier_loss(probs, y_hot[:,i])
+            end
+            ∇loss = only(grad)
+            state, perceptron = Optimisers.update(state, perceptron, ∇loss)
+            total_loss += loss
+        end
+        # make some noise, if allowed:
+        verbosity > 0 && @info "Training loss: $total_loss"
+        total_loss
+    end
+    return perceptron, state, losses
+end
+
+
+# ## Implementation
+
+# ### Algorithm
+
+# no docstring here - that goes with the constructor;
+# SOME FIELDS LEFT ABSTRACT FOR SIMPLICITY
+struct PerceptronClassifier
+    epochs::Int
+    optimiser # an optimiser from Optimisers.jl
+    rng
+end
+
+"""
+    PerceptronClassifier(; epochs=50, optimiser=Optimisers.Adam(), rng=Random.default_rng())
+
+Instantiate a perceptron classifier.
+
+Train an instance, `algorithm`, by doing `model = fit(algorithm, X, y)`, where
+
+- `X` is a `Float32` matrix, with observations-as-columns
+- `y` (target) is some one-dimensional `CategoricalArray`.
+
+Get probabilistic predictions with `predict(model, Xnew)` and
+point predictions with `predict(model, Point(), Xnew)`.
+
+# Warm restart options
+
+    update_observations(model, newdata; replacements...)
+
+Return an updated model, with the weights and bias of the previously learned perceptron
+used as the starting state in new gradient descent updates. Adopt any specified
+hyperparameter `replacements` (properties of `LearnAPI.algorithm(model)`).
+
+    update(model, newdata; epochs=n, replacements...)
+
+If `Δepochs = n - LearnAPI.algorithm(model).epochs` is non-negative, then return an
+updated model, with the weights and bias of the previously learned perceptron used as the
+starting state in new gradient descent updates for `Δepochs` epochs, and using the
+provided `newdata` instead of the previous training data. Any other hyperparameter
+`replacements` are also adopted. If `Δepochs` is negative or not specified, instead
+return `fit(algorithm, newdata)`, where
+`algorithm = LearnAPI.clone(algorithm; epochs=n, replacements...)`.
+
+"""
+PerceptronClassifier(; epochs=50, optimiser=Optimisers.Adam(), rng=Random.default_rng()) =
+    PerceptronClassifier(epochs, optimiser, rng)
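+
+# A minimal usage sketch (hypothetical `X`, `y`, `Xnew`, as described in the docstring
+# above; see the tests at the end of this file for a complete, runnable example):
+#
+#     algorithm = PerceptronClassifier(epochs=40)
+#     model = fit(algorithm, X, y)
+#     predict(model, Point(), Xnew)            # point predictions
+#     predict(model, Distribution(), Xnew)     # probabilistic predictions
+#     model = update(model, X, y; epochs=70)   # warm restart: 30 additional epochs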
+
+
+# ### Data interface
+
+# For raw training data:
+LearnAPI.target(algorithm::PerceptronClassifier, data::Tuple) = last(data)
+
+# For wrapping pre-processed training data (output of `obs(algorithm, data)`):
+struct PerceptronClassifierObservations
+    X::Matrix{Float32}
+    y_hot::BitMatrix # one-hot encoded target
+    classes # the (ordered) pool of `y`, as `CategoricalValue`s
+end
+
+# For pre-processing the training data:
+function LearnAPI.obs(algorithm::PerceptronClassifier, data::Tuple)
+    X, y = data
+    classes = CategoricalDistributions.classes(y)
+    y_hot = classes .== permutedims(y) # one-hot encoding
+    return PerceptronClassifierObservations(X, y_hot, classes)
+end
+
+# implement `RandomAccess()` interface for output of `obs`:
+Base.length(observations::PerceptronClassifierObservations) =
+    size(observations.y_hot, 2)
+Base.getindex(observations::PerceptronClassifierObservations, I) =
+    PerceptronClassifierObservations(
+        (@view observations.X[:, I]),
+        (@view observations.y_hot[:, I]),
+        observations.classes,
+    )
+
+# target recovered by decoding the one-hot representation:
+LearnAPI.target(
+    algorithm::PerceptronClassifier,
+    observations::PerceptronClassifierObservations,
+) = observations.classes[map(findfirst, eachcol(observations.y_hot))]
+
+LearnAPI.features(
+    algorithm::PerceptronClassifier,
+    observations::PerceptronClassifierObservations,
+) = observations.X
+
+# Note that data consumed by `predict` needs no pre-processing, so no need to overload
+# `obs(model, data)`.
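+
+# For illustration only (a sketch, not used by the tests below): if `y` has the two
+# levels "A" and "B", and `X` has size (2, 3), then `obs` returns a
+# `PerceptronClassifierObservations` object whose `y_hot` has one column per observation:
+#
+#     observations = obs(PerceptronClassifier(), (X, y))
+#     observations.y_hot   # Bool[1 0 1; 0 1 0], when y == ["A", "B", "A"]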
+
+
+# ### Fitting and updating
+
+# For wrapping outcomes of learning:
+struct PerceptronClassifierFitted
+    algorithm::PerceptronClassifier
+    perceptron # component array storing weights and bias
+    state # optimiser state
+    classes # target classes
+    losses
+end
+
+LearnAPI.algorithm(model::PerceptronClassifierFitted) = model.algorithm
+
+# `fit` for pre-processed data (output of `obs(algorithm, data)`):
+function LearnAPI.fit(
+    algorithm::PerceptronClassifier,
+    observations::PerceptronClassifierObservations;
+    verbosity=1,
+    )
+
+    # unpack hyperparameters:
+    epochs = algorithm.epochs
+    optimiser = algorithm.optimiser
+    rng = deepcopy(algorithm.rng) # to prevent mutation of `algorithm`!
+
+    # unpack data:
+    X = observations.X
+    y_hot = observations.y_hot
+    classes = observations.classes
+    nclasses = length(classes)
+    p = size(X, 1) # number of features
+
+    # initialize bias and weights:
+    weights = randn(rng, Float32, nclasses, p)
+    bias = zeros(Float32, nclasses)
+    perceptron = (; weights, bias) |> ComponentArrays.ComponentArray
+
+    # initialize optimiser:
+    state = Optimisers.setup(optimiser, perceptron)
+
+    perceptron, state, losses = corefit(perceptron, X, y_hot, epochs, state, verbosity)
+
+    return PerceptronClassifierFitted(algorithm, perceptron, state, classes, losses)
+end
+
+# `fit` for unprocessed data:
+LearnAPI.fit(algorithm::PerceptronClassifier, data; kwargs...) =
+    fit(algorithm, obs(algorithm, data); kwargs...)
+
+# see the `PerceptronClassifier` docstring for `update_observations` logic.
+function LearnAPI.update_observations(
+    model::PerceptronClassifierFitted,
+    observations::PerceptronClassifierObservations;
+    verbosity=1,
+    replacements...,
+    )
+
+    # unpack data:
+    X = observations.X
+    y_hot = observations.y_hot
+    classes = observations.classes
+    nclasses = length(classes)
+
+    classes == model.classes || error("New training target has incompatible classes.")
+
+    algorithm_old = LearnAPI.algorithm(model)
+    algorithm = LearnAPI.clone(algorithm_old; replacements...)
+
+    perceptron = model.perceptron
+    state = model.state
+    losses = model.losses
+    epochs = algorithm.epochs
+
+    perceptron, state, losses_new = corefit(perceptron, X, y_hot, epochs, state, verbosity)
+    losses = vcat(losses, losses_new)
+
+    return PerceptronClassifierFitted(algorithm, perceptron, state, classes, losses)
+end
+LearnAPI.update_observations(model::PerceptronClassifierFitted, data; kwargs...) =
+    update_observations(model, obs(LearnAPI.algorithm(model), data); kwargs...)
+
+# see the `PerceptronClassifier` docstring for `update` logic.
+function LearnAPI.update(
+    model::PerceptronClassifierFitted,
+    observations::PerceptronClassifierObservations;
+    verbosity=1,
+    replacements...,
+    )
+
+    # unpack data:
+    X = observations.X
+    y_hot = observations.y_hot
+    classes = observations.classes
+    nclasses = length(classes)
+
+    classes == model.classes || error("New training target has incompatible classes.")
+
+    algorithm_old = LearnAPI.algorithm(model)
+    algorithm = LearnAPI.clone(algorithm_old; replacements...)
+    :epochs in keys(replacements) || return fit(algorithm, observations)
+
+    perceptron = model.perceptron
+    state = model.state
+    losses = model.losses
+
+    epochs = algorithm.epochs
+    Δepochs = epochs - algorithm_old.epochs
+    Δepochs < 0 && return fit(algorithm, observations)
+
+    perceptron, state, losses_new = corefit(perceptron, X, y_hot, Δepochs, state, verbosity)
+    losses = vcat(losses, losses_new)
+
+    return PerceptronClassifierFitted(algorithm, perceptron, state, classes, losses)
+end
+LearnAPI.update(model::PerceptronClassifierFitted, data; kwargs...) =
+    update(model, obs(LearnAPI.algorithm(model), data); kwargs...)
+
+
+# ### Predict
+
+function LearnAPI.predict(model::PerceptronClassifierFitted, ::Distribution, Xnew)
+    perceptron = model.perceptron
+    classes = model.classes
+    probs = perceptron.weights*Xnew .+ perceptron.bias |> NNlib.softmax
+    return CategoricalDistributions.UnivariateFinite(classes, probs')
+end
+
+LearnAPI.predict(model::PerceptronClassifierFitted, ::Point, Xnew) =
+    mode.(predict(model, Distribution(), Xnew))
+
+
+# ### Accessor functions
+
+LearnAPI.training_losses(model::PerceptronClassifierFitted) = model.losses
+
+
+# ### Traits
+
+@trait(
+    PerceptronClassifier,
+    constructor = PerceptronClassifier,
+    iteration_parameter = :epochs,
+    kinds_of_proxy = (Distribution(), Point()),
+    tags = ("classification", "iterative algorithms", "incremental algorithms"),
+    functions = (
+        :(LearnAPI.fit),
+        :(LearnAPI.algorithm),
+        :(LearnAPI.minimize),
+        :(LearnAPI.obs),
+        :(LearnAPI.features),
+        :(LearnAPI.target),
+        :(LearnAPI.update),
+        :(LearnAPI.update_observations),
+        :(LearnAPI.predict),
+        :(LearnAPI.training_losses),
+    )
+)
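+
+# With these traits declared, generic LearnAPI client code can introspect the algorithm.
+# For example (a sketch, not part of the tests below):
+#
+#     LearnAPI.iteration_parameter(PerceptronClassifier())               # :epochs
+#     :(LearnAPI.predict) in LearnAPI.functions(PerceptronClassifier())  # true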
+
+
+# ## Tests
+
+# synthetic test data:
+N = 10
+n = 10N # number of observations
+p = 2 # number of features
+train = 1:6N
+test = (6N+1:10N)
+rng = StableRNG(123)
+X = randn(rng, Float32, p, n);
+coefficients = rand(rng, Float32, p)'
+y_continuous = coefficients*X |> vec
+η1 = quantile(y_continuous, 1/3)
+η2 = quantile(y_continuous, 2/3)
+y = map(y_continuous) do η
+    η < η1 && return "A"
+    η < η2 && return "B"
+    "C"
+end |> CategoricalDistributions.categorical;
+Xtrain = X[:, train];
+Xtest = X[:, test];
+ytrain = y[train];
+ytest = y[test];
+
+@testset "PerceptronClassifier" begin
+    rng = StableRNG(123)
+    algorithm = PerceptronClassifier(; optimiser=Optimisers.Adam(0.01), epochs=40, rng)
+    @test LearnAPI.clone(algorithm) == algorithm
+    @test :(LearnAPI.update) in LearnAPI.functions(algorithm)
+    @test LearnAPI.target(algorithm, (X, y)) == y
+    @test LearnAPI.features(algorithm, (X, y)) == X
+
+    model40 = fit(algorithm, Xtrain, ytrain; verbosity=0)
+
+    # 40 epochs is sufficient for 90% accuracy in this case:
+    @test sum(predict(model40, Point(), Xtest) .== ytest)/length(ytest) > 0.9
+
+    # get probabilistic predictions:
+    ŷ40 = predict(model40, Distribution(), Xtest);
+    @test predict(model40, Xtest) ≈ ŷ40
+
+    # add 30 epochs in an `update`:
+    model70 = update(model40, Xtrain, y[train]; verbosity=0, epochs=70)
+    ŷ70 = predict(model70, Xtest);
+    @test !(ŷ70 ≈ ŷ40)
+
+    # compare with cold restart:
+    model = fit(LearnAPI.clone(algorithm; epochs=70), Xtrain, y[train]; verbosity=0);
+    @test ŷ70 ≈ predict(model, Xtest)
+
+    # add 30 epochs using `update_observations` instead:
+    model70b = update_observations(model40, Xtrain, y[train]; verbosity=0, epochs=30)
+    @test ŷ70 ≈ predict(model70b, Xtest) ≈ predict(model, Xtest)
+end
+
+true

From 09290666b9ab1e8d0c4e972b949ab61f2e386f5c Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Fri, 11 Oct 2024 14:22:05 +1300
Subject: [PATCH 2/4] update roadmap

---
 ROADMAP.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ROADMAP.md b/ROADMAP.md
index cd524ba5..7da578ae 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -16,7 +16,7 @@
   - [x] regression
   - [ ] classification
   - [ ] clustering
-  - [ ] gradient descent
+  - [x] gradient descent
   - [x] iterative algorithms
   - [ ] incremental algorithms
   - [ ] dimension reduction

From 671491c8db8078ddacaf65ccf32ceaaf82a694a9 Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Fri, 11 Oct 2024 14:31:09 +1300
Subject: [PATCH 3/4] fix outdated file reference in tests

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index dee0c17b..2c66588d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,7 +6,7 @@ test_files = [
     "clone.jl",
     "integration/regression.jl",
     "integration/static_algorithms.jl",
-    "integration/iterative_algorithms.jl",
+    "integration/ensembling.jl",
 ]

 files = isempty(ARGS) ? test_files : ARGS

From 458b03cb12867632fdcd7aca8da0229e62c22d88 Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Fri, 11 Oct 2024 14:52:36 +1300
Subject: [PATCH 4/4] fix flawed test

---
 test/integration/ensembling.jl | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/test/integration/ensembling.jl b/test/integration/ensembling.jl
index 1e92e203..657f0d29 100644
--- a/test/integration/ensembling.jl
+++ b/test/integration/ensembling.jl
@@ -194,13 +194,14 @@ Xtest = Tables.subset(X, test)
     ŷ7 = predict(model, Xtest)

     # compare with cold restart:
-    model = fit(LearnAPI.clone(algorithm; n=7), Xtrain, y[train]; verbosity=0);
-    @test ŷ7 ≈ predict(model, Xtest)
+    model_cold = fit(LearnAPI.clone(algorithm; n=7), Xtrain, y[train]; verbosity=0);
+    @test ŷ7 ≈ predict(model_cold, Xtest)

-    # test cold restart if another hyperparameter is changed:
+    # test that we get a cold restart if another hyperparameter is changed:
     model2 = update(model, Xtrain, y[train]; atom=Ridge(0.05))
-    algorithm2 = LearnAPI.clone(LearnAPI.algorithm(model); atom=Ridge(0.05))
-    @test predict(model, Xtest) ≈ predict(model2, Xtest)
+    algorithm2 = Ensemble(Ridge(0.05); n=7, rng)
+    model_cold = fit(algorithm2, Xtrain, y[train]; verbosity=0)
+    @test predict(model2, Xtest) ≈ predict(model_cold, Xtest)
 end