Commit 14eb466

first pass at Gramian training for OLS (#146)
* proof of concept
* AbstractMatrix -> AVR
* cleaner impl
* endline
* fix error type
* construct kernels if not passed in
* add test case for implicit gram construction
* last endline
* check for isempty instead of iszero
1 parent 0b48318 commit 14eb466

8 files changed (+81, -12 lines)

src/fit/default.jl (14 additions, 5 deletions)

@@ -33,13 +33,22 @@ $SIGNATURES
 Fit a generalised linear regression model using an appropriate solver based on
 the loss and penalty of the model. A method can, in some cases, be specified.
 """
-function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR;
+function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; data=nothing,
              solver::Solver=_solver(glr, size(X)))
-    check_nrows(X, y)
-    n, p = size(X)
-    c = getc(glr, y)
-    return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept))
+    if hasproperty(solver, :gram) && solver.gram
+        # interpret X,y as X'X, X'y
+        data = verify_or_construct_gramian(glr, X, y, data)
+        p = size(data.XX, 2)
+        return _fit(glr, solver, data.XX, data.Xy, (; dims=(data.n, p, 0)))
+    else
+        check_nrows(X, y)
+        n, p = size(X)
+        c = getc(glr, y)
+        return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept))
+    end
 end
+fit(glr::GLR; kwargs...) = fit(glr, zeros((0,0)), zeros((0,)); kwargs...)
+
 
 function scratch(n, p, c=0; i=false)
     p_ = p + Int(i)
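
A minimal usage sketch of the two entry points added above (variable names are illustrative, not part of the commit): with a `gram` solver, either `X` and `y` are passed as usual and the Gramian is built internally, or `X'X`, `X'y` and `n` are supplied via `data` and the positional arguments are omitted.

    using MLJLinearModels
    X, y = randn(100, 5), randn(100)
    enr = ElasticNetRegression(0.1, 0.1; fit_intercept=false,
                               scale_penalty_with_samples=false)
    # implicit: the Gramian X'X, X'y is constructed from X, y internally
    θ1 = fit(enr, X, y; solver=FISTA(gram=true))
    # explicit: pass X'X, X'y, n directly and omit X, y
    θ2 = fit(enr; data=(; XX=X'X, Xy=X'y, n=100), solver=FISTA(gram=true))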

src/fit/proxgrad.jl (13 additions, 4 deletions)

@@ -3,7 +3,7 @@
 # Assumption: loss has gradient; penalty has prox e.g.: Lasso
 # J(θ) = f(θ) + r(θ) where f is smooth
 function _fit(glr::GLR, solver::ProxGrad, X, y, scratch)
-    _,p,c = npc(scratch)
+    n,p,c = npc(scratch)
     c > 0 && (p *= c)
     # vector caches + eval cache
     θ = zeros(p) # θ_k
@@ -19,9 +19,18 @@ function _fit(glr::GLR, solver::ProxGrad, X, y, scratch)
     η = 1.0 # stepsize (1/L)
     acc = ifelse(solver.accel, 1.0, 0.0) # if 0, no extrapolation (ISTA)
     # functions
-    _f = smooth_objective(glr, X, y; c=c)
-    _fg! = smooth_fg!(glr, X, y, scratch)
-    _prox! = prox!(glr, size(X, 1))
+    _f = if solver.gram
+        smooth_gram_objective(glr, X, y, n)
+    else
+        smooth_objective(glr, X, y; c=c)
+    end
+
+    _fg! = if solver.gram
+        smooth_gram_fg!(glr, X, y, n)
+    else
+        smooth_fg!(glr, X, y, scratch)
+    end
+    _prox! = prox!(glr, n)
     bt_cond = θ̂ ->
         _f(θ̂) > fθ̄ + dot(θ̂ .- θ̄, ∇fθ̄) + sum(abs2.(θ̂ .- θ̄)) / (2η)
     # loop-related
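
In the `gram` branch each iteration touches only the p×p matrix `X'X` and the p-vector `X'y`, so the per-iteration cost is O(p²) rather than O(np). A self-contained sketch of the idea (for illustration only, independent of the package internals):

    using LinearAlgebra: opnorm
    # ISTA on the Gramian form of the lasso: minimise θ'XXθ/2 - θ'Xy + λ‖θ‖₁
    soft(x, t) = sign(x) * max(abs(x) - t, 0.0)
    function ista_gram(XX, Xy, λ; η=1/opnorm(XX), iters=1_000)
        θ = zeros(size(XX, 2))
        for _ in 1:iters
            g = XX * θ .- Xy               # gradient of the smooth part
            θ = soft.(θ .- η .* g, η * λ)  # prox step on the l1 penalty
        end
        return θ
    end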

src/fit/solvers.jl (1 addition, 0 deletions)

@@ -133,6 +133,7 @@ Proximal Gradient solver for non-smooth objective functions.
     tol::Float64 = 1e-4 # tol relative change of θ i.e. norm(θ-θ_)/norm(θ)
     max_inner::Int = 100 # β^max_inner should be > 1e-10
     beta::Float64 = 0.8 # in (0, 1); shrinkage in the backtracking step
+    gram::Bool = false # use precomputed Gramian for lsq where possible
 end

 FISTA(; kwa...) = ProxGrad(;accel = true, kwa...)
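
Since `FISTA` forwards its keyword arguments to `ProxGrad`, the new flag can be set through either constructor:

    FISTA(gram=true)                 # ProxGrad(accel=true, gram=true)
    ProxGrad(accel=false, gram=true) # unaccelerated variant with the Gramian objective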

src/glr/d_l2loss.jl (9 additions, 0 deletions)

@@ -72,3 +72,12 @@ function smooth_fg!(glr::GLR{L2Loss,<:ENR}, X, y, scratch)
         return glr.loss(r) + get_l2(glr.penalty)(view_θ(glr, θ))
     end
 end
+
+function smooth_gram_fg!(glr::GLR{L2Loss,<:ENR}, XX, Xy, n)
+    λ = get_penalty_scale_l2(glr, n)
+    (g, θ) -> begin
+        _g = XX * θ .- Xy
+        g .= _g .+ λ .* θ
+        return θ'*_g + get_l2(glr.penalty)(view_θ(glr, θ))
+    end
+end
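
The closure above relies on the identity X'(Xθ - y) = (X'X)θ - X'y, i.e. the ridge-penalised least-squares gradient can be evaluated from the Gramian alone. A quick illustrative check (names made up for the example):

    X, y, λ, θ = randn(50, 3), randn(50), 0.1, randn(3)
    g_full = X' * (X * θ .- y) .+ λ .* θ   # gradient from the raw data
    g_gram = (X'X) * θ .- (X'y) .+ λ .* θ  # gradient from the Gramian only
    @assert g_full ≈ g_gram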

src/glr/utils.jl (4 additions, 1 deletion)

@@ -1,4 +1,4 @@
-export objective, smooth_objective
+export objective, smooth_objective, smooth_gram_objective

 # NOTE: RobustLoss are not always everywhere smooth but "smooth-enough".
 const SmoothLoss = Union{L2Loss, LogisticLoss, MultinomialLoss, RobustLoss}
@@ -37,6 +37,9 @@ Return the smooth part of the objective function of a GLR.
 """
 smooth_objective(glr::GLR{<:SmoothLoss,<:ENR}, n) = glr.loss + get_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, n, 1.)

+smooth_gram_objective(glr::GLR{<:SmoothLoss,<:ENR}, XX, Xy, n) =
+    θ -> (θ'*XX*θ)/2 - (θ'*Xy) + (get_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, n, 1.))(θ)
+
 smooth_objective(::GLR) = @error "Case not implemented yet."

 """

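The Gramian objective above equals the usual least-squares objective ‖Xθ - y‖²/2 up to the constant ‖y‖²/2, which does not depend on θ and therefore leaves the minimiser unchanged. An illustrative check:

    X, y, θ = randn(50, 3), randn(50), randn(3)
    full = sum(abs2, X * θ .- y) / 2
    gram = (θ' * (X'X) * θ) / 2 - θ' * (X'y)
    @assert full ≈ gram + sum(abs2, y) / 2
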
src/mlj/classifiers.jl (2 additions, 2 deletions)

@@ -65,7 +65,7 @@ See also [`MultinomialClassifier`](@ref).
 """some instance of `MLJLinearModels.S` where `S` is one of: `LBFGS`, `Newton`,
 `NewtonCG`, `ProxGrad`; but subject to the following restrictions:

-- If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxyGrad` is the only
+- If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxGrad` is the only
   option.

 - Unless `scitype(y) <: Finite{2}` (binary target) `Newton` is disallowed.
@@ -142,7 +142,7 @@ See also [`LogisticClassifier`](@ref).
 """some instance of `MLJLinearModels.S` where `S` is one of: `LBFGS`,
 `NewtonCG`, `ProxGrad`; but subject to the following restrictions:

-- If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxyGrad` is the only
+- If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxGrad` is the only
   option.

 - Unless `scitype(y) <: Finite{2}` (binary target) `Newton` is disallowed.

src/utils.jl (23 additions, 0 deletions)

@@ -9,6 +9,29 @@ function check_nrows(X::AbstractMatrix, y::AbstractVecOrMat)::Nothing
     throw(DimensionMismatch("`X` and `y` must have the same number of rows."))
 end

+function verify_or_construct_gramian(glr, X, y, data)
+    check_nrows(X, y)
+    isnothing(data) && return (; XX = X'X, Xy = X'y, n = length(y))
+
+    !all(hasproperty.(Ref(data), (:XX, :Xy, :n))) && throw(ArgumentError("data must contain XX, Xy, n"))
+    size(data.XX, 1) != size(data.Xy, 1) && throw(DimensionMismatch("`XX` and `Xy` must have the same number of rows."))
+    !issymmetric(data.XX) && throw(ArgumentError("Input `XX` must be symmetric"))
+
+    c = getc(glr, data.Xy)
+    !iszero(c) && throw(ArgumentError("Categorical loss not supported with Gramian kernel"))
+    glr.fit_intercept && throw(ArgumentError("Intercept not supported with Gramian kernel"))
+
+    if any(!isempty, (X, y))
+        all((
+            isapprox(X'X, data.XX; rtol=1e-5),
+            isapprox(X'y, data.Xy; rtol=1e-5),
+            length(y) == data.n
+        )) || throw(ArgumentError("Inputs `X` and `y` do not match inputs `XX` and `Xy`."))
+    end
+
+    return data
+end
+
 """
 $SIGNATURES
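
A sketch of how the internal helper behaves when called directly (reusing the setup from the test at the bottom of this commit; the error message is the one thrown above):

    data = (; XX = X'X, Xy = X'y, n = length(y))
    verify_or_construct_gramian(enr, X, y, data)     # consistent: returns data
    verify_or_construct_gramian(enr, X, y, nothing)  # builds (; XX=X'X, Xy=X'y, n)
    verify_or_construct_gramian(enr, X, y, (; XX = data.XX, Xy = data.Xy))
    # throws ArgumentError("data must contain XX, Xy, n")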

test/fit/ols-ridge-lasso-elnet.jl (15 additions, 0 deletions)

@@ -146,3 +146,18 @@ end
     @test nnz(θ_sk) == 8
 end
 end
+
+@testset "gramian" begin
+    λ = 0.1
+    γ = 0.1
+    enr = ElasticNetRegression(λ, γ; fit_intercept=false,
+                               scale_penalty_with_samples=false)
+    XX = X'X
+    Xy = X'y
+    n = size(X, 1)
+    θ_fista = fit(enr, X, y; solver=FISTA(max_iter=5000))
+    θ_gram_explicit = fit(enr; data=(; XX, Xy, n), solver=FISTA(max_iter=5000, gram=true))
+    θ_gram_implicit = fit(enr, X, y; solver=FISTA(max_iter=5000, gram=true))
+    @test isapprox(θ_fista, θ_gram_explicit, rtol=1e-5)
+    @test isapprox(θ_gram_explicit, θ_gram_implicit; rtol=1e-5)
+end
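
One likely payoff of the explicit form (an assumption about intended use, not stated in the commit): when n is much larger than p, `X'X` and `X'y` can be computed once and reused across many fits, e.g. along a regularisation path:

    XX, Xy, n = X'X, X'y, size(X, 1)
    solver = FISTA(max_iter=5000, gram=true)
    θs = [fit(ElasticNetRegression(λ, 0.1; fit_intercept=false,
                                   scale_penalty_with_samples=false);
              data=(; XX, Xy, n), solver=solver)
          for λ in (0.01, 0.1, 1.0)]

Each such fit then costs O(p²) per iteration regardless of n.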
