From 43c28ec10062c8d437c658466a1618b195407dc6 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 12 Nov 2025 09:57:02 +1300 Subject: [PATCH 1/5] extend DeterministicConstantRegressor to multitargets --- src/builtins/Constant.jl | 91 +++++++++++++++++++++++++++++++++++++-- test/builtins/Constant.jl | 39 +++++++++++++++-- 2 files changed, 123 insertions(+), 7 deletions(-) diff --git a/src/builtins/Constant.jl b/src/builtins/Constant.jl index e546307..84fceaa 100644 --- a/src/builtins/Constant.jl +++ b/src/builtins/Constant.jl @@ -24,23 +24,38 @@ MLJModelInterface.predict(::ConstantRegressor, fitresult, Xnew) = fill(fitresult, nrows(Xnew)) ## -## THE CONSTANT DETERMINISTIC REGRESSOR (FOR TESTING) +## THE CONSTANT DETERMINISTIC REGRESSOR ## +# helpers: +_mean(y) = _mean(y, scitype(y)) +_mean(y, ::Type{<:AbstractArray}) = mean(y, dims=1) +_mean(y, ::Type{<:Table}) = _mean(Tables.matrix(y), AbstractArray) +_materializer(y) = _materializer(y, scitype(y)) +_materializer(y, ::Type{<:AbstractMatrix}) = identity +_materializer(y, ::Type{<:AbstractVector}) = vec +_materializer(y, ::Type{<:Table}) = Tables.materializer(y)∘Tables.table + struct DeterministicConstantRegressor <: Deterministic end function MLJModelInterface.fit(::DeterministicConstantRegressor, verbosity::Int, X, y) - fitresult = mean(y) + μ = _mean(y) + materializer = _materializer(y) + fitresult = (; μ, materializer) cache = nothing report = NamedTuple() return fitresult, cache, report end MLJModelInterface.predict(::DeterministicConstantRegressor, fitresult, Xnew) = - fill(fitresult, nrows(Xnew)) + hcat([fill(fitresult.μ[i], nrows(Xnew)) for i in eachindex(fitresult.μ)]...) 
|> + fitresult.materializer + +MLJModelInterface.fitted_params(model::DeterministicConstantRegressor, fitresult) = + (; mean=fitresult.μ) ## ## THE CONSTANT CLASSIFIER @@ -115,7 +130,11 @@ metadata_model( metadata_model( DeterministicConstantRegressor, input_scitype = Table, - target_scitype = AbstractVector{Continuous}, + target_scitype = Union{ + AbstractMatrix{Continuous}, + AbstractVector{Continuous}, + Table, + }, supports_weights = false, load_path = "MLJModels.DeterministicConstantRegressor" ) @@ -150,6 +169,9 @@ mean or median values instead. If not specified, a normal distribution is fit. Almost any reasonable model is expected to outperform `ConstantRegressor` which is used almost exclusively for testing and establishing performance baselines. +If you need a multitarget dummy regressor, consider using `DeterministicConstantRegressor` +instead. + In MLJ (or MLJModels) do `model = ConstantRegressor()` or `model = ConstantRegressor(distribution=...)` to construct a model instance. @@ -211,6 +233,67 @@ See also """ ConstantRegressor +""" + DeterministicConstantRegressor + +This "dummy" predictor always makes the same prediction, irrespective of the provided +input pattern, namely the mean value of the training target values. (It's counterpart, +`ConstantRegressor` makes probabilistic predictions.) This model handles mutlitargets, +i.e, the training target can be a matrix or a table (observations the rows). + +Almost any reasonable model is expected to outperform `DeterministicConstantRegressor` +which is used almost exclusively for testing and establishing performance baselines. + +In MLJ, do `model = DeterministicConstantRegressor()` to construct a model instance. 
+ + +# Training data + +In MLJ (or MLJBase) bind an instance `model` to data with + + mach = machine(model, X, y) + +Here: + +- `X` is any table of input features (eg, a `DataFrame`) + +- `y` is the target, which can be any `AbstractVector`, `AbstractMatrix` or table whose + element scitype is `Continuous`; check the scitype `scitype(y)` or, for tables, with + `schema(y)` + +Train the machine using `fit!(mach, rows=...)`. + +# Operations + +- `predict(mach, Xnew)`: Return predictions of the target given + features `Xnew` (which for this model are ignored). + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `mean`: The target mean(s). Always a row vector. + +# Examples + +```julia +using MLJ + +X, y = make_regression(10, 2; n_targets=3) # synthetic data: two tables +regressor = DeterministicConstantRegressor() +mach = machine(regressor, X, y) |> fit! + +fitted_params(mach) + +Xnew, _ = make_regression(3, 2) +predict(mach, Xnew) + +``` +See also +[`ConstantClassifier`](@ref) +""" +DeterministicConstantRegressor + """ ConstantClassifier diff --git a/test/builtins/Constant.jl b/test/builtins/Constant.jl index 2636abd..59d136b 100644 --- a/test/builtins/Constant.jl +++ b/test/builtins/Constant.jl @@ -1,12 +1,13 @@ module TestConstant using Test, MLJModels, CategoricalArrays -import Distributions, MLJBase +import Distributions, MLJBase, Tables + # Any X will do for constant models: X = NamedTuple{(:x1,:x2,:x3)}((rand(10), rand(10), rand(10))) -@testset "Regressor" begin +@testset "ConstantRegressor" begin y = [1.0, 1.0, 2.0, 2.0] model = ConstantRegressor(distribution_type= @@ -25,7 +26,39 @@ X = NamedTuple{(:x1,:x2,:x3)}((rand(10), rand(10), rand(10))) @test MLJBase.load_path(model) == "MLJModels.ConstantRegressor" end -@testset "Classifier" begin +@testset "DeterministicConstantRegressor" begin + + X = (; x=ones(3)) + S = MLJBase.target_scitype(DeterministicConstantRegressor()) + + # vector target: + y = Float64[2, 3, 4] + @test MLJBase.scitype(y) 
<: S + mach = MLJBase.machine(MLJModels.DeterministicConstantRegressor(), X, y) + MLJBase.fit!(mach, verbosity=0) + @test MLJBase.predict(mach, X) ≈ [3, 3, 3] + @test only(MLJBase.fitted_params(mach).mean) ≈ 3 + + # matrix target: + y = Float64[2 5; 3 6; 4 7] + @test MLJBase.scitype(y) <: S + mach = MLJBase.machine(MLJModels.DeterministicConstantRegressor(), X, y) + MLJBase.fit!(mach, verbosity=0) + @test MLJBase.predict(mach, X) ≈ [3 6; 3 6; 3 6] + @test MLJBase.fitted_params(mach).mean ≈ [3 6] + + # tabular target: + y = Float64[2 5; 3 6; 4 7] |> Tables.table |> Tables.rowtable + @test MLJBase.scitype(y) <: S + mach = MLJBase.machine(MLJModels.DeterministicConstantRegressor(), X, y) + MLJBase.fit!(mach, verbosity=0) + yhat = MLJBase.predict(mach, X) + @test yhat isa Vector{<:NamedTuple} + @test Tables.matrix(yhat) ≈ [3 6; 3 6; 3 6] + @test MLJBase.fitted_params(mach).mean ≈ [3 6] +end + +@testset "ConstantClassifier" begin yraw = ["Perry", "Antonia", "Perry", "Skater"] y = categorical(yraw) From 36ad9b2a834dcb7d0c39cd87844e0601b67e86b9 Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Thu, 20 Nov 2025 11:31:48 +1300 Subject: [PATCH 2/5] ensure predicted multitargets have the right column names --- src/builtins/Constant.jl | 11 +++++++---- test/builtins/Constant.jl | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/builtins/Constant.jl b/src/builtins/Constant.jl index 84fceaa..52b8f80 100644 --- a/src/builtins/Constant.jl +++ b/src/builtins/Constant.jl @@ -34,7 +34,10 @@ _mean(y, ::Type{<:Table}) = _mean(Tables.matrix(y), AbstractArray) _materializer(y) = _materializer(y, scitype(y)) _materializer(y, ::Type{<:AbstractMatrix}) = identity _materializer(y, ::Type{<:AbstractVector}) = vec -_materializer(y, ::Type{<:Table}) = Tables.materializer(y)∘Tables.table +function _materializer(y, ::Type{<:Table}) + names = Tables.columnnames(Tables.columntable(y)) + Tables.materializer(y)∘(matrix->Tables.table(matrix; header=names)) +end struct DeterministicConstantRegressor <: Deterministic end @@ -279,9 +282,9 @@ The fields of `fitted_params(mach)` are: ```julia using MLJ -X, y = make_regression(10, 2; n_targets=3) # synthetic data: two tables -regressor = DeterministicConstantRegressor() -mach = machine(regressor, X, y) |> fit! 
+X, y = make_regression(10, 2; n_targets=3); # synthetic data: two tables +regressor = DeterministicConstantRegressor(); +mach = machine(regressor, X, y) |> fit!; fitted_params(mach) diff --git a/test/builtins/Constant.jl b/test/builtins/Constant.jl index 59d136b..23eebee 100644 --- a/test/builtins/Constant.jl +++ b/test/builtins/Constant.jl @@ -48,12 +48,13 @@ end @test MLJBase.fitted_params(mach).mean ≈ [3 6] # tabular target: - y = Float64[2 5; 3 6; 4 7] |> Tables.table |> Tables.rowtable + y = Tables.table(Float64[2 5; 3 6; 4 7], header=[:x, :y]) |> Tables.rowtable @test MLJBase.scitype(y) <: S mach = MLJBase.machine(MLJModels.DeterministicConstantRegressor(), X, y) MLJBase.fit!(mach, verbosity=0) yhat = MLJBase.predict(mach, X) @test yhat isa Vector{<:NamedTuple} + @test keys(yhat[1]) == (:x, :y) @test Tables.matrix(yhat) ≈ [3 6; 3 6; 3 6] @test MLJBase.fitted_params(mach).mean ≈ [3 6] end From c39e537875602f1055114a434d375afbffe5f7b6 Mon Sep 17 00:00:00 2001 From: "Anthony Blaom, PhD" Date: Thu, 20 Nov 2025 11:34:43 +1300 Subject: [PATCH 3/5] Update src/builtins/Constant.jl Co-authored-by: Okon Samuel <39421418+OkonSamuel@users.noreply.github.com> --- src/builtins/Constant.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/builtins/Constant.jl b/src/builtins/Constant.jl index 52b8f80..882e997 100644 --- a/src/builtins/Constant.jl +++ b/src/builtins/Constant.jl @@ -275,7 +275,7 @@ Train the machine using `fit!(mach, rows=...)`. The fields of `fitted_params(mach)` are: -- `mean`: The target mean(s). Always a row vector. +- `mean`: The target mean(s). Always a row vector (i.e., an `AbstractMatrix` with one row). # Examples From 38d49dc5bc8cfcb9656c2bd71016e2f80ce1a71e Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Mon, 24 Nov 2025 11:06:59 +1300 Subject: [PATCH 4/5] clearer doc string --- src/builtins/Constant.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/builtins/Constant.jl b/src/builtins/Constant.jl index 52b8f80..3f75412 100644 --- a/src/builtins/Constant.jl +++ b/src/builtins/Constant.jl @@ -242,7 +242,7 @@ ConstantRegressor This "dummy" predictor always makes the same prediction, irrespective of the provided input pattern, namely the mean value of the training target values. (It's counterpart, `ConstantRegressor` makes probabilistic predictions.) This model handles mutlitargets, -i.e, the training target can be a matrix or a table (observations the rows). +i.e., the training target can be a matrix or a table (with rows as observations). Almost any reasonable model is expected to outperform `DeterministicConstantRegressor` which is used almost exclusively for testing and establishing performance baselines. From 2d4d1c11997bd91dccfe78a92cef0b162999d762 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Mon, 24 Nov 2025 11:08:39 +1300 Subject: [PATCH 5/5] bump 0.18.3 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 315d615..aeedc6a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJModels" uuid = "d491faf4-2d78-11e9-2867-c94bc002c0b7" authors = ["Anthony D. Blaom "] -version = "0.18.2" +version = "0.18.3" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"