Commit 1530a84

add iterative algorithms to integration tests
1 parent 72009e2 commit 1530a84

File tree

2 files changed: +347 -1 lines changed


Project.toml

Lines changed: 12 additions & 1 deletion
@@ -15,9 +15,20 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

 [targets]
-test = ["DataFrames", "LinearAlgebra", "MLUtils", "Random", "Serialization", "Statistics", "Tables", "Test"]
+test = [
+    "DataFrames",
+    "LinearAlgebra",
+    "MLUtils",
+    "Random",
+    "Serialization",
+    "StableRNGs",
+    "Statistics",
+    "Tables",
+    "Test",
+]
Lines changed: 335 additions & 0 deletions
@@ -0,0 +1,335 @@
using LearnAPI
using LinearAlgebra
using Tables
import MLUtils
import DataFrames
using Random
using Statistics
using StableRNGs
using Serialization
using Test

# # ENSEMBLE OF RIDGE REGRESSORS

# We implement a toy algorithm that creates a bagged ensemble of ridge regressors (as
# defined already in test/integration/regressors.jl), i.e., each atomic model is trained
# on a random sample of the training observations (the same number, but sampled with
# replacement). In particular, this algorithm has an iteration parameter `n`, and we
# implement `update` for warm restarts whenever `n` increases.

# By re-using the data interface already provided for `Ridge`, we make the resampling
# (bagging) more efficient: the table -> matrix conversion happens only once, and we
# resample the matrices directly, not the tables.
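
# For illustration, the bagging below boils down to the following (a hedged sketch;
# `observations` is the `RidgeFitObs` object returned by `LearnAPI.obs(Ridge(), (X, y))`
# in the regressors.jl implementation assumed here, and `rng` is the ensemble's RNG):
#
#     observations = LearnAPI.obs(Ridge(), (X, y)) # table -> matrix conversion, once
#     N = MLUtils.numobs(observations)
#     bag = rand(rng, 1:N, N)                      # N indices, sampled with replacement
#     MLUtils.getobs(observations, bag)            # resamples the matrix, not the table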

# no docstring here - that goes with the constructor
struct RidgeEnsemble
    lambda::Float64
    rng # leaving abstract for simplicity
    n::Int
end

"""
    RidgeEnsemble(; lambda=0.1, rng=Random.default_rng(), n=10)

Instantiate a RidgeEnsemble algorithm, bla, bla, bla...

"""
RidgeEnsemble(; lambda=0.1, rng=Random.default_rng(), n=10) =
    RidgeEnsemble(lambda, rng, n) # LearnAPI.constructor defined later

struct RidgeEnsembleFitted
    algorithm::RidgeEnsemble
    atom::Ridge
    rng    # mutated copy of `algorithm.rng`
    models # leaving abstract for simplicity
end

LearnAPI.algorithm(model::RidgeEnsembleFitted) = model.algorithm

# we use the same data interface we provided for `Ridge` in regressors.jl:
LearnAPI.obs(algorithm::RidgeEnsemble, data) = LearnAPI.obs(Ridge(), data)
LearnAPI.obs(model::RidgeEnsembleFitted, data) = LearnAPI.obs(first(model.models), data)
LearnAPI.target(algorithm::RidgeEnsemble, data) = LearnAPI.target(Ridge(), data)
LearnAPI.features(algorithm::RidgeEnsemble, data) = LearnAPI.features(Ridge(), data)

# a helper for debugging (not used by the tests below): print the last few digits of an
# RNG's integer state:
function d(rng)
    i = digits(rng.state)
    m = min(length(i), 4)
    tail = i[end - m + 1:end]
    println(join(string.(tail)))
end

# Because we need observation subsampling, we first implement `fit` for the output of
# `obs`:
function LearnAPI.fit(algorithm::RidgeEnsemble, data::RidgeFitObs; verbosity=1)

    # unpack hyperparameters:
    lambda = algorithm.lambda
    rng = deepcopy(algorithm.rng) # to prevent mutation of `algorithm`
    n = algorithm.n

    # instantiate atomic algorithm:
    atom = Ridge(lambda)

    # initialize ensemble:
    models = []

    # get number of observations:
    N = MLUtils.numobs(data)

    # train the ensemble:
    for _ in 1:n
        bag = rand(rng, 1:N, N)
        data_subset = MLUtils.getobs(data, bag)
        # step down one verbosity level in atomic fit:
        model = fit(atom, data_subset; verbosity=verbosity - 1)
        push!(models, model)
    end

    # make some noise, if allowed:
    verbosity > 0 && @info "Trained $n ridge regression models. "

    return RidgeEnsembleFitted(algorithm, atom, rng, models)

end

# ... and so we also need a `fit` for unprocessed `data = (X, y)`:
LearnAPI.fit(algorithm::RidgeEnsemble, data; kwargs...) =
    fit(algorithm, obs(algorithm, data); kwargs...)
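
# Either entry point may now be used. Since `fit` works from a `deepcopy` of
# `algorithm.rng`, the two calls below are expected to train identical ensembles (a
# hedged sketch, with `X` any Tables.jl-compatible table and `y` a vector):
#
#     fit(algorithm, (X, y))                     # unprocessed data
#     fit(algorithm, obs(algorithm, (X, y)))     # pre-processed data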

# If `n` is increased, the `update` below adds new regressors to the existing ensemble,
# applying any other hyperparameter replacements (e.g., `lambda`) only when computing
# the new regressors. Otherwise, `update` is equivalent to retraining from scratch with
# the provided hyperparameter replacements.
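
# For example (a hedged sketch), starting from `model = fit(algorithm, data)` with
# `n = 10`:
#
#     update(model, data; n=15)          # warm restart: only 5 new regressors trained
#     update(model, data; lambda=0.05)   # cold restart: all regressors retrained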

function LearnAPI.update(
    model::RidgeEnsembleFitted,
    data::RidgeFitObs;
    verbosity=1,
    replacements...,
    )

    algorithm_old = LearnAPI.algorithm(model)
    algorithm = LearnAPI.clone(algorithm_old; replacements...)

    # if `n` is not among the replacements, retrain from scratch:
    :n in keys(replacements) || return fit(algorithm, data; verbosity)

    n = algorithm.n
    Δn = n - algorithm_old.n

    # if `n` has decreased, retrain from scratch:
    Δn < 0 && return fit(algorithm, data; verbosity)

    # get number of observations:
    N = MLUtils.numobs(data)

    # initialize:
    models = model.models
    rng = model.rng # as mutated in previous `fit`/`update` calls

    atom = Ridge(; lambda=algorithm.lambda)

    # add new regressors to the ensemble:
    for _ in 1:Δn
        bag = rand(rng, 1:N, N)
        data_subset = MLUtils.getobs(data, bag)
        model = fit(atom, data_subset; verbosity=verbosity - 1)
        push!(models, model)
    end

    # make some noise, if allowed:
    verbosity > 0 && @info "Trained $Δn additional ridge regression models. "

    return RidgeEnsembleFitted(algorithm, atom, rng, models)
end

# an `update` for unprocessed `data = (X, y)`:
LearnAPI.update(model::RidgeEnsembleFitted, data; kwargs...) =
    update(model, obs(LearnAPI.algorithm(model), data); kwargs...)

# `data` here can be pre-processed or not, because we're just calling the atomic
# `predict`, which already has a data interface, and we don't need any subsampling, like
# we did for `fit`:
LearnAPI.predict(model::RidgeEnsembleFitted, ::Point, data) =
    mean(model.models) do atomic_model
        predict(atomic_model, Point(), data)
    end

LearnAPI.minimize(model::RidgeEnsembleFitted) = RidgeEnsembleFitted(
    model.algorithm,
    model.atom,
    model.rng,
    minimize.(model.models),
)

# note the inclusion of `iteration_parameter`:
@trait(
    RidgeEnsemble,
    constructor = RidgeEnsemble,
    iteration_parameter = :n,
    kinds_of_proxy = (Point(),),
    tags = ("regression", "ensemble algorithms", "iterative models"),
    functions = (
        :(LearnAPI.fit),
        :(LearnAPI.algorithm),
        :(LearnAPI.minimize),
        :(LearnAPI.obs),
        :(LearnAPI.features),
        :(LearnAPI.target),
        :(LearnAPI.update),
        :(LearnAPI.predict),
        :(LearnAPI.feature_importances),
    )
)
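
# The `iteration_parameter` trait tells generic client code which hyperparameter to bump
# when it wants cheap warm restarts via `update`. A hedged sketch of such a loop (the
# wrapper logic is hypothetical, not something LearnAPI provides):
#
#     iter = LearnAPI.iteration_parameter(algorithm)    # :n
#     model = fit(algorithm, data)
#     for n in [20, 40, 80]
#         model = update(model, data; (iter => n,)...)  # adds regressors; no retraining
#     end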

# synthetic test data:
N = 10 # number of observations
train = 1:6
test = 7:10
a, b, c = rand(N), rand(N), rand(N)
X = (; a, b, c)
X = DataFrames.DataFrame(X)
y = 2a - b + 3c + 0.05*rand(N)
data = (X, y)
Xtrain = Tables.subset(X, train)
Xtest = Tables.subset(X, test)

@testset "test an implementation of bagged ensemble of ridge regressors" begin
    rng = StableRNG(123)
    algorithm = RidgeEnsemble(lambda=0.5, n=4; rng)
    @test LearnAPI.clone(algorithm) == algorithm
    @test :(LearnAPI.obs) in LearnAPI.functions(algorithm)
    @test LearnAPI.target(algorithm, data) == y
    @test LearnAPI.features(algorithm, data) == X

    model = @test_logs(
        (:info, r"Trained 4 ridge"),
        fit(algorithm, Xtrain, y[train]; verbosity=1),
    );

    ŷ4 = predict(model, Point(), Xtest)
    @test ŷ4 == predict(model, Xtest)

    # add 3 atomic models to the ensemble:
    # model = @test_logs(
    #     (:info, r"Trained 3 additional"),
    #     update(model, Xtrain, y[train]; n=7),
    # )
    model = update(model, Xtrain, y[train]; verbosity=0, n=7);
    ŷ7 = predict(model, Xtest)

    # compare with cold restart:
    model = fit(LearnAPI.clone(algorithm; n=7), Xtrain, y[train]; verbosity=0);
    @test ŷ7 ≈ predict(model, Xtest)

    fitobs = LearnAPI.obs(algorithm, data)
    predictobs = LearnAPI.obs(model, X)
    model = fit(algorithm, MLUtils.getobs(fitobs, train); verbosity=0)
    @test LearnAPI.target(algorithm, fitobs) == y
    @test predict(model, Point(), MLUtils.getobs(predictobs, test)) ≈ ŷ4
    @test predict(model, LearnAPI.features(algorithm, fitobs)) ≈ predict(model, X)

    @test LearnAPI.feature_importances(model) isa Vector{<:Pair{Symbol}}

    filename = tempname()
    small_model = minimize(model)
    serialize(filename, small_model)

    recovered_model = deserialize(filename)
    @test LearnAPI.algorithm(recovered_model) == algorithm
    @test predict(
        recovered_model,
        Point(),
        MLUtils.getobs(predictobs, test),
    ) ≈ ŷ4

end

# # VARIATION OF RIDGE REGRESSION THAT USES FALLBACK OF LearnAPI.obs
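
# (For reference: the `LearnAPI.obs` fallback, as we understand it, simply returns
# `data` unchanged, so here `obs(algorithm, (X, y))` is just `(X, y)`, and the
# MLUtils-based subsampling in the tests below acts on the raw table and target
# directly.)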

# no docstring here - that goes with the constructor
struct BabyRidge
    lambda::Float64
end

"""
    BabyRidge(; lambda=0.1)

Instantiate a ridge regression algorithm, with regularization of `lambda`.

"""
BabyRidge(; lambda=0.1) = BabyRidge(lambda) # LearnAPI.constructor defined later

struct BabyRidgeFitted{T,F}
    algorithm::BabyRidge
    coefficients::Vector{T}
    feature_importances::F
end

function LearnAPI.fit(algorithm::BabyRidge, data; verbosity=1)

    X, y = data

    lambda = algorithm.lambda
    table = Tables.columntable(X)
    names = Tables.columnnames(table) |> collect
    A = Tables.matrix(table)'

    # apply core algorithm:
    coefficients = (A*A' + algorithm.lambda*I)\(A*y) # vector

    feature_importances = nothing

    return BabyRidgeFitted(algorithm, coefficients, feature_importances)

end

# extracting stuff from training data:
LearnAPI.target(::BabyRidge, data) = last(data)

LearnAPI.algorithm(model::BabyRidgeFitted) = model.algorithm

LearnAPI.predict(model::BabyRidgeFitted, ::Point, Xnew) =
    Tables.matrix(Xnew)*model.coefficients

LearnAPI.minimize(model::BabyRidgeFitted) =
    BabyRidgeFitted(model.algorithm, model.coefficients, nothing)

@trait(
    BabyRidge,
    constructor = BabyRidge,
    kinds_of_proxy = (Point(),),
    tags = ("regression",),
    functions = (
        :(LearnAPI.fit),
        :(LearnAPI.algorithm),
        :(LearnAPI.minimize),
        :(LearnAPI.obs),
        :(LearnAPI.features),
        :(LearnAPI.target),
        :(LearnAPI.predict),
        :(LearnAPI.feature_importances),
    )
)

@testset "test a variation which does not overload LearnAPI.obs" begin
    algorithm = BabyRidge(lambda=0.5)

    model = fit(algorithm, Tables.subset(X, train), y[train]; verbosity=0)
    ŷ = predict(model, Point(), Tables.subset(X, test))
    @test ŷ isa Vector{Float64}

    fitobs = obs(algorithm, data)
    predictobs = LearnAPI.obs(model, X)
    model = fit(algorithm, MLUtils.getobs(fitobs, train); verbosity=0)
    @test predict(model, Point(), MLUtils.getobs(predictobs, test)) ==
        predict(model, MLUtils.getobs(predictobs, test))
    @test LearnAPI.target(algorithm, data) == y
    @test LearnAPI.predict(model, X) ≈
        LearnAPI.predict(model, LearnAPI.features(algorithm, data))
end

true
