Commit 93ab5b4

Merge pull request #20 from alan-turing-institute/split-linmodels
Split `LinearRegressor` and `RidgeRegressor` models
2 parents: a6f9425 + cac8d14
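
In short: the single `LinearRegressor` and `RidgeRegressor` models, which previously accepted either a vector or a table of targets, are split into single-target models plus new `MultitargetLinearRegressor` and `MultitargetRidgeRegressor` models; the shared `fit`/`fitted_params`/`predict` methods are kept once and dispatched through `LINREG` and `RIDGEREG` unions (see the diffs below).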

File tree

3 files changed: +189 −117 lines changed

src/MLJMultivariateStatsInterface.jl

Lines changed: 75 additions & 62 deletions

```diff
@@ -14,8 +14,7 @@ using LinearAlgebra
 
 # ===================================================================
 ## EXPORTS
-export LinearRegressor, RidgeRegressor, PCA, KernelPCA, ICA, PPCA, FactorAnalysis, LDA,
-    BayesianLDA, SubspaceLDA, BayesianSubspaceLDA
+# Models are exported automatically by `@mlj_model` macro
 
 # ===================================================================
 ## Re-EXPORTS
@@ -34,70 +33,84 @@ const FactorAnalysisResultType = MS.FactorAnalysis
 const default_kernel = (x, y) -> x'y #default kernel used in KernelPCA
 
 # Definitions of model descriptions for use in model doc-strings.
-const PCA_DESCR = """Principal component analysis. Learns a linear transformation to
-    project the data on a lower dimensional space while preserving most of the initial
-    variance.
-    """
+const PCA_DESCR = """
+    Principal component analysis. Learns a linear transformation to
+    project the data on a lower dimensional space while preserving most of the initial
+    variance.
+"""
 const KPCA_DESCR = "Kernel principal component analysis."
 const ICA_DESCR = "Independent component analysis."
 const PPCA_DESCR = "Probabilistic principal component analysis"
 const FactorAnalysis_DESCR = "Factor Analysis"
-const LDA_DESCR = """Multiclass linear discriminant analysis. The algorithm learns a
-    projection matrix `P` that projects a feature matrix `Xtrain` onto a lower dimensional
-    space of dimension `out_dim` such that the trace of the transformed between-class scatter
-    matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the transformed within-class
-    scatter matrix (`Pᵀ*Sw*P`).The projection matrix is scaled such that `Pᵀ*Sw*P=I` or
-    `Pᵀ*Σw*P=I`(where `Σw` is the within-class covariance matrix) .
-    Predicted class posterior probability for feature matrix `Xtest` are derived by applying
-    a softmax transformationto a matrix `Pr`, such that rowᵢ of `Pr` contains computed
-    distances(based on a distance metric) in the transformed space of rowᵢ in `Xtest` to the
-    centroid of each class.
-    """
-const BayesianLDA_DESCR = """Bayesian Multiclass linear discriminant analysis. The algorithm
-    learns a projection matrix `P` that projects a feature matrix `Xtrain` onto a lower
-    dimensional space of dimension `out_dim` such that the trace of the transformed
-    between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the
-    transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled such
-    that `Pᵀ*Sw*P = n` or `Pᵀ*Σw*P=I` (Where `n` is the number of training samples and `Σw`
-    is the within-class covariance matrix).
-    Predicted class posterior probability distibution are derived by applying Bayes rule with
-    a multivariate Gaussian class-conditional distribution.
-    """
-const SubspaceLDA_DESCR = """Multiclass linear discriminant analysis. Suitable for high
-    dimensional data (Avoids computing scatter matrices `Sw` ,`Sb`). The algorithm learns a
-    projection matrix `P = W*L` that projects a feature matrix `Xtrain` onto a lower
-    dimensional space of dimension `nc - 1` such that the trace of the transformed
-    between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the
-    transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled such
-    that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of training
-    samples, mult` is one of `n` or `1` depending on whether `Sb` is normalized, `Σw` is the
-    within-class covariance matrix, and `nc` is the number of unique classes in `y`) and also
-    obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`.
-    Predicted class posterior probability for feature matrix `Xtest` are derived by applying a
-    softmax transformation to a matrix `Pr`, such that rowᵢ of `Pr` contains computed
-    distances(based on a distance metric) in the transformed space of rowᵢ in `Xtest` to the
-    centroid of each class.
-    """
-const BayesianSubspaceLDA_DESCR = """Bayesian Multiclass linear discriminant analysis.
-    Suitable for high dimensional data (Avoids computing scatter matrices `Sw` ,`Sb`). The
-    algorithm learns a projection matrix `P = W*L` (`Sw`), that projects a feature matrix
-    `Xtrain` onto a lower dimensional space of dimension `nc-1` such that the trace of the
-    transformed between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace
-    of the transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is
-    scaled such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of
-    training samples, `mult` is one of `n` or `1` depending on whether `Sb` is normalized,
-    `Σw` is the within-class covariance matrix, and `nc` is the number of unique classes in
-    `y`) and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`.
-    Posterior class probability distibution are derived by applying Bayes rule with a
-    multivariate Gaussian class-conditional distribution
-    """
-const LINEAR_DESCR = """Linear regression. Learns a linear combination(s) of given
-    variables to fit the responses by minimizing the squared error between.
-    """
-const RIDGE_DESCR = """Ridge regressor with regularization parameter lambda. Learns a
-    linear regression with a penalty on the l2 norm of the coefficients.
-    """
+const LDA_DESCR = """
+    Multiclass linear discriminant analysis. The algorithm learns a
+    projection matrix `P` that projects a feature matrix `Xtrain` onto a lower dimensional
+    space of dimension `out_dim` such that the trace of the transformed between-class
+    scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the transformed
+    within-class scatter matrix (`Pᵀ*Sw*P`).The projection matrix is scaled such that
+    `Pᵀ*Sw*P=I` or `Pᵀ*Σw*P=I`(where `Σw` is the within-class covariance matrix) .
+    Predicted class posterior probability for feature matrix `Xtest` are derived by
+    applying a softmax transformationto a matrix `Pr`, such that rowᵢ of `Pr` contains
+    computed distances(based on a distance metric) in the transformed space of rowᵢ in
+    `Xtest` to the centroid of each class.
+"""
+const BayesianLDA_DESCR = """
+    Bayesian Multiclass linear discriminant analysis. The algorithm
+    learns a projection matrix `P` that projects a feature matrix `Xtrain` onto a lower
+    dimensional space of dimension `out_dim` such that the trace of the transformed
+    between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the
+    transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled
+    such that `Pᵀ*Sw*P = n` or `Pᵀ*Σw*P=I` (Where `n` is the number of training samples
+    and `Σw` is the within-class covariance matrix).
+    Predicted class posterior probability distibution are derived by applying Bayes rule
+    with a multivariate Gaussian class-conditional distribution.
+"""
+const SubspaceLDA_DESCR = """
+    Multiclass linear discriminant analysis. Suitable for high
+    dimensional data (Avoids computing scatter matrices `Sw` ,`Sb`). The algorithm learns a
+    projection matrix `P = W*L` that projects a feature matrix `Xtrain` onto a lower
+    dimensional space of dimension `nc - 1` such that the trace of the transformed
+    between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the
+    transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled
+    such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of
+    training samples, mult` is one of `n` or `1` depending on whether `Sb` is normalized,
+    `Σw` is the within-class covariance matrix, and `nc` is the number of unique classes
+    in `y`) and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`.
+    Predicted class posterior probability for feature matrix `Xtest` are derived by
+    applying a softmax transformation to a matrix `Pr`, such that rowᵢ of `Pr` contains
+    computed distances(based on a distance metric) in the transformed space of rowᵢ in
+    `Xtest` to the centroid of each class.
+"""
+const BayesianSubspaceLDA_DESCR = """
+    Bayesian Multiclass linear discriminant analysis. Suitable for high dimensional data
+    (Avoids computing scatter matrices `Sw` ,`Sb`). The algorithm learns a projection
+    matrix `P = W*L` (`Sw`), that projects a feature matrix `Xtrain` onto a lower
+    dimensional space of dimension `nc-1` such that the trace of the transformed
+    between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the
+    transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled
+    such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of
+    training samples, `mult` is one of `n` or `1` depending on whether `Sb` is normalized,
+    `Σw` is the within-class covariance matrix, and `nc` is the number of unique classes in
+    `y`) and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`.
+    Posterior class probability distibution are derived by applying Bayes rule with a
+    multivariate Gaussian class-conditional distribution
+"""
+const LinearRegressor_DESCR = """
+    Linear Regression. Learns a linear combination of given
+    variables to fit the response by minimizing the squared error between.
+"""
+const MultitargetLinearRegressor_DESCR = """
+    Multitarget Linear Regression. Learns linear combinations of given
+    variables to fit the responses by minimizing the squared error between.
+"""
+const RidgeRegressor_DESCR = """
+    Ridge regressor with regularization parameter lambda. Learns a
+    linear regression with a penalty on the l2 norm of the coefficients.
+"""
+const MultitargetRidgeRegressor_DESCR = """
+    Multitarget Ridge regressor with regularization parameter lambda. Learns a
+    Multitarget linear regression with a penalty on the l2 norm of the coefficients.
+"""
 const PKG = "MLJMultivariateStatsInterface"
 
 # ===================================================================
```
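
The LDA-family docstrings above all describe the same posterior computation: project the test rows, measure their distance to each projected class centroid, and apply a row-wise softmax. Below is a minimal illustrative sketch of that recipe, assuming squared Euclidean distance; the names `lda_posteriors`, `P`, and `centroids` are hypothetical stand-ins, not the package's internals.

```julia
using LinearAlgebra

# Hypothetical sketch: posterior class probabilities as a softmax over
# negative squared distances in the discriminant space.
function lda_posteriors(P::AbstractMatrix, centroids::AbstractMatrix, Xtest::AbstractMatrix)
    Z = Xtest * P      # rows of `Xtest` projected by the learned matrix `P`
    C = centroids * P  # one row per class centroid, projected the same way
    # Pr[i, k] = -‖Zᵢ - Cₖ‖²: larger (less negative) means closer to class k
    Pr = [-sum(abs2, Z[i, :] .- C[k, :]) for i in 1:size(Z, 1), k in 1:size(C, 1)]
    expPr = exp.(Pr .- maximum(Pr, dims=2))  # row-wise softmax, numerically stabilized
    return expPr ./ sum(expPr, dims=2)       # each row sums to one
end
```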

src/models/linear_models.jl

Lines changed: 106 additions & 47 deletions

```diff
@@ -1,20 +1,6 @@
-####
-#### LinearRegressor
-####
-
-"""
-    LinearRegressor(; bias::Bool=true)
-
-$LINEAR_DESCR
-
-# Keyword Parameters
-
-- `bias::Bool=true`: if true includes a bias term else fits without bias term.
-"""
-@mlj_model mutable struct LinearRegressor <: MMI.Deterministic
-    bias::Bool = true
-end
-
+#######
+## Common Regressor methods
+########
 struct LinearFitresult{T, F<:Real, M<:AbstractArray{F}} <: MMI.MLJType
     sol_matrix::M
     bias::Bool
@@ -37,15 +23,6 @@ function _matrix(X, target)
     return Xmatrix, Y, _names(target)
 end
 
-function MMI.fit(model::LinearRegressor, verbosity::Int, X, y)
-    Xmatrix, y_, target_header= _matrix(X, y)
-    θ = MS.llsq(Xmatrix, y_; bias=model.bias)
-    fitresult = LinearFitresult(θ, model.bias, target_header)
-    report = NamedTuple()
-    cache = nothing
-    return fitresult, cache, report
-end
-
 function _regressor_fitted_params(fr::LinearFitresult{Nothing, <:Real, <:AbstractVector})
     return (
         coefficients=fr.sol_matrix[1:end-Int(fr.bias)],
@@ -60,10 +37,6 @@ function _regressor_fitted_params(fr::LinearFitresult{<:Vector, <:Real, <:Abstra
     )
 end
 
-function MMI.fitted_params(::LinearRegressor, fr)
-    return _regressor_fitted_params(fr)
-end
-
 function _predict_regressor(
     fr::LinearFitresult{Nothing, <:Real, <:AbstractVector},
     Xmat_new::AbstractMatrix,
```

```diff
@@ -98,30 +71,66 @@
     end
 end
 
-function MMI.predict(::LinearRegressor, fr, Xnew)
+####
+#### LinearRegressor & MultitargetLinearRegressor
+####
+
+"""
+    LinearRegressor(; bias::Bool=true)
+
+$LinearRegressor_DESCR
+
+# Keyword Parameters
+
+- `bias::Bool=true`: if true includes a bias term else fits without bias term.
+"""
+@mlj_model mutable struct LinearRegressor <: MMI.Deterministic
+    bias::Bool = true
+end
+
+"""
+    MultitargetLinearRegressor(; bias::Bool=true)
+
+$MultitargetLinearRegressor_DESCR
+
+# Keyword Parameters
+
+- `bias::Bool=true`: if true includes a bias term else fits without bias term.
+"""
+@mlj_model mutable struct MultitargetLinearRegressor <: MMI.Deterministic
+    bias::Bool = true
+end
+
+const LINREG = Union{LinearRegressor, MultitargetLinearRegressor}
+
+function MMI.fit(model::LINREG, verbosity::Int, X, y)
+    Xmatrix, y_, target_header= _matrix(X, y)
+    θ = MS.llsq(Xmatrix, y_; bias=model.bias)
+    fitresult = LinearFitresult(θ, model.bias, target_header)
+    report = NamedTuple()
+    cache = nothing
+    return fitresult, cache, report
+end
+
+function MMI.fitted_params(::LINREG, fr)
+    return _regressor_fitted_params(fr)
+end
+
+function MMI.predict(::LINREG, fr, Xnew)
     Xmat_new = MMI.matrix(Xnew)
     return _predict_regressor(fr, Xmat_new, Xnew)
 end
 
-metadata_model(
-    LinearRegressor,
-    input=Table(Continuous),
-    target=Union{Table(Continuous), AbstractVector{Continuous}},
-    weights=false,
-    descr=LINEAR_DESCR,
-    path="$(PKG).LinearRegressor"
-)
-
 ####
-#### RidgeRegressor
+#### RidgeRegressor & MultitargetRidgeRegressor
 ####
 
 _check_typeof_lambda(x)= x isa AbstractVecOrMat || (x isa Real && x ≥ 0)
 
 """
     RidgeRegressor(; lambda::Union{Real, AbstractVecOrMat}=1.0, bias::Bool=true)
 
-$RIDGE_DESCR
+$RidgeRegressor_DESCR
 
 # Keyword Parameters
 
```
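
From the user's side the split introduced in this hunk looks as follows. This is a hedged usage sketch, not part of the commit: it assumes MLJ is installed alongside this interface package, and the data (`X`, `y`, `Y`) and bindings (`LinReg`, `MultiLinReg`) are made up.

```julia
using MLJ

X = (x1 = rand(100), x2 = rand(100))           # feature table
y = 2 .* X.x1 .- X.x2 .+ 0.1 .* randn(100)     # single target: a vector
Y = (y1 = y, y2 = -y)                          # multiple targets: a table

LinReg = @load LinearRegressor pkg=MultivariateStats
MultiLinReg = @load MultitargetLinearRegressor pkg=MultivariateStats

mach1 = fit!(machine(LinReg(bias=true), X, y))       # vector target
mach2 = fit!(machine(MultiLinReg(bias=true), X, Y))  # table target

predict(mach1, X)  # a vector of predictions
predict(mach2, X)  # a table of predictions, one column per target
```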

```diff
@@ -134,7 +143,25 @@ $RIDGE_DESCR
     bias::Bool = true
 end
 
-function MMI.fit(model::RidgeRegressor, verbosity::Int, X, y)
+"""
+    MultitargetRidgeRegressor(; lambda::Union{Real, AbstractVecOrMat}=1.0, bias::Bool=true)
+
+$MultitargetRidgeRegressor_DESCR
+
+# Keyword Parameters
+
+- `lambda::Union{Real, AbstractVecOrMat}=1.0`: non-negative parameter for the
+    regularization strength.
+- `bias::Bool=true`: if true includes a bias term else fits without bias term.
+"""
+@mlj_model mutable struct MultitargetRidgeRegressor <: MMI.Deterministic
+    lambda::Union{Real, AbstractVecOrMat} = 1.0::(_check_typeof_lambda(_))
+    bias::Bool = true
+end
+
+const RIDGEREG = Union{RidgeRegressor, MultitargetRidgeRegressor}
+
+function MMI.fit(model::RIDGEREG, verbosity::Int, X, y)
     Xmatrix, y_, target_header = _matrix(X, y)
     θ = MS.ridge(Xmatrix, y_, model.lambda; bias=model.bias)
     fitresult = LinearFitresult(θ, model.bias, target_header)
```
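
The `lambda` field in both ridge structs is guarded by the `@mlj_model` constraint syntax `1.0::(_check_typeof_lambda(_))`. Here is a standalone sketch of what that validator admits, with the definition repeated from the diff above:

```julia
using LinearAlgebra  # for `Diagonal` below

_check_typeof_lambda(x) = x isa AbstractVecOrMat || (x isa Real && x ≥ 0)

_check_typeof_lambda(1.0)                   # true:  non-negative scalar
_check_typeof_lambda(-0.5)                  # false: negative scalars are rejected
_check_typeof_lambda([0.1, 0.2])            # true:  a vector of penalties
_check_typeof_lambda(Diagonal([1.0, 2.0]))  # true:  a matrix-valued penalty
```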

```diff
@@ -143,20 +170,52 @@ function MMI.fit(model::RidgeRegressor, verbosity::Int, X, y)
     return fitresult, cache, report
 end
 
-function MMI.fitted_params(::RidgeRegressor, fr)
+function MMI.fitted_params(::RIDGEREG, fr)
     return _regressor_fitted_params(fr)
 end
 
-function MMI.predict(::RidgeRegressor, fr, Xnew)
+function MMI.predict(::RIDGEREG, fr, Xnew)
     Xmat_new = MMI.matrix(Xnew)
     return _predict_regressor(fr, Xmat_new, Xnew)
 end
 
+
+############
+### Models Metadata
+############
+metadata_model(
+    LinearRegressor,
+    input=Table(Continuous),
+    target=AbstractVector{Continuous},
+    weights=false,
+    descr=LinearRegressor_DESCR,
+    path="$(PKG).LinearRegressor"
+)
+
+metadata_model(
+    MultitargetLinearRegressor,
+    input=Table(Continuous),
+    target=Table(Continuous),
+    weights=false,
+    descr=MultitargetLinearRegressor_DESCR,
+    path="$(PKG).MultitargetLinearRegressor"
+)
+
 metadata_model(
     RidgeRegressor,
     input=Table(Continuous),
-    target=Union{Table(Continuous), AbstractVector{Continuous}},
+    target=AbstractVector{Continuous},
     weights=false,
-    descr=RIDGE_DESCR,
+    descr=RidgeRegressor_DESCR,
     path="$(PKG).RidgeRegressor"
 )
+
+metadata_model(
+    MultitargetRidgeRegressor,
+    input=Table(Continuous),
+    target=Table(Continuous),
+    weights=false,
+    descr=MultitargetRidgeRegressor_DESCR,
+    path="$(PKG).MultitargetRidgeRegressor"
+)
```
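
The metadata split above is what MLJ reports through its model traits. An illustrative check, not part of the commit, assuming `MLJBase` (which defines the `target_scitype` trait) is loaded and the models are in scope:

```julia
using MLJBase

target_scitype(LinearRegressor())             # an AbstractVector{Continuous}
target_scitype(MultitargetLinearRegressor())  # a Table(Continuous) scitype
target_scitype(RidgeRegressor())              # an AbstractVector{Continuous}
target_scitype(MultitargetRidgeRegressor())   # a Table(Continuous) scitype
```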