
Commit 848cfcc

ReneEnjilian authored and mboehm7 committed
[SYSTEMDS-3553] Additional nn optimizers (AdamW, ScaledGD)
DIA WiSe 24/25 project. Closes #2206.
1 parent f7af63f commit 848cfcc

File tree

2 files changed: +248 -0 lines changed


scripts/nn/optim/adamw.dml

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

/*
 * Adam optimizer with weight decay (AdamW)
 */

update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2,
                  double epsilon, double lambda, int t, matrix[double] m, matrix[double] v)
    return (matrix[double] X, matrix[double] m, matrix[double] v)
{
  /*
   * Performs an AdamW update.
   *
   * Reference:
   *  - Decoupled Weight Decay Regularization, Ilya Loshchilov, Frank Hutter.
   *  - https://arxiv.org/abs/1711.05101v3
   *
   * Inputs:
   *  - X: Parameters to update, of shape (any, any).
   *  - dX: Gradient wrt `X` of a loss function being optimized, of
   *      same shape as `X`.
   *  - lr: Learning rate. Recommended value is 0.001.
   *  - beta1: Exponential decay rate for the 1st moment estimates.
   *      Recommended value is 0.9.
   *  - beta2: Exponential decay rate for the 2nd moment estimates.
   *      Recommended value is 0.999.
   *  - epsilon: Smoothing term to avoid divide by zero errors.
   *      Recommended value is 1e-8.
   *  - lambda: Weight decay factor that penalizes large weights.
   *      Recommended value is 0.01.
   *  - t: Timestep, starting at 0.
   *  - m: State containing the 1st moment (mean) estimate by
   *      maintaining exponential moving averages of the gradients, of
   *      same shape as `X`.
   *  - v: State containing the 2nd raw moment (uncentered variance)
   *      estimate by maintaining exponential moving averages of the
   *      squared gradients, of same shape as `X`.
   *
   * Outputs:
   *  - X: Updated parameters `X`, of same shape as input `X`.
   *  - m: Updated state containing the 1st moment (mean) estimate by
   *      maintaining exponential moving averages of the gradients, of
   *      same shape as `X`.
   *  - v: Updated state containing the 2nd raw moment (uncentered
   *      variance) estimate by maintaining exponential moving averages
   *      of the squared gradients, of same shape as `X`.
   */
  t = t + 1
  m = beta1*m + (1-beta1)*dX    # update biased 1st moment estimate
  v = beta2*v + (1-beta2)*dX^2  # update biased 2nd raw moment estimate
  m_hat = m / (1-beta1^t)       # compute bias-corrected 1st moment estimate
  v_hat = v / (1-beta2^t)       # compute bias-corrected 2nd raw moment estimate
  X = X - lr * (m_hat / (sqrt(v_hat) + epsilon) + lambda * X)
}

init = function(matrix[double] X)
    return (matrix[double] m, matrix[double] v)
{
  /*
   * Initialize the state for this optimizer.
   *
   * Note: This is just a convenience function, and state
   * may be initialized manually if needed.
   *
   * Inputs:
   *  - X: Parameters to update, of shape (any, any).
   *
   * Outputs:
   *  - m: Initial state containing the 1st moment (mean) estimate by
   *      maintaining exponential moving averages of the gradients, of
   *      same shape as `X`.
   *  - v: Initial state containing the 2nd raw moment (uncentered
   *      variance) estimate by maintaining exponential moving averages
   *      of the squared gradients, of same shape as `X`.
   */
  m = matrix(0, rows=nrow(X), cols=ncol(X))
  v = matrix(0, rows=nrow(X), cols=ncol(X))
}
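
For orientation, a minimal usage sketch (not part of this commit) of how `adamw::init` and `adamw::update` could drive a training loop; the source path, the parameter matrix `W`, the placeholder gradient, and the hyperparameter values are illustrative assumptions:

# sketch only: assumes the script is run from the repository root; adjust the path otherwise
source("scripts/nn/optim/adamw.dml") as adamw

W = rand(rows=10, cols=5)      # hypothetical parameter matrix
[m, v] = adamw::init(W)        # zero-initialized moment estimates
for (i in 0:99) {
  dW = 2 * W                   # placeholder gradient, e.g. of sum(W^2)
  # lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, lambda=0.01, timestep i
  [W, m, v] = adamw::update(W, dW, 0.001, 0.9, 0.999, 1e-8, 0.01, i, m, v)
}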

scripts/nn/optim/scaled_gd.dml

Lines changed: 151 additions & 0 deletions
@@ -0,0 +1,151 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

/*
 * ScaledGD optimizer.
 */

update = function(matrix[double] X, matrix[double] Y,
                  matrix[double] dX, matrix[double] dY, double lr, int r)
    return (matrix[double] X_new, matrix[double] Y_new)
{
  /*
   * Performs one iteration of the Scaled Gradient Descent update.
   *
   * Reference:
   *  - "Accelerating Ill-Conditioned Low-Rank Matrix Estimation
   *    via Scaled Gradient Descent" (arXiv:2005.08898).
   *
   * Typical Steps:
   *  1) Orthonormal Extension (dimension doubling):
   *     - Extend X and Y to [X, X⊥] and [Y, Y⊥] so each is m×2r and n×2r,
   *       with orthonormal columns.
   *  2) Gradient Step:
   *     - Subtract lr*dX and lr*dY in the extended (2r) space.
   *  3) Rank-r Truncation:
   *     - Recompute X_new, Y_new by SVD on the updated matrix
   *       of size m×n (i.e., from (X̂ - lr*Gx̂)*(Ŷ - lr*Gŷ)ᵀ).
   *     - Take only top-r singular values and corresponding vectors.
   *
   * Inputs:
   *  - X: Current m×r matrix (factor or parameter).
   *  - Y: Current n×r matrix (factor or parameter).
   *  - dX: Gradient w.r.t. X, same shape as X.
   *  - dY: Gradient w.r.t. Y, same shape as Y.
   *  - lr: Learning rate (scalar).
   *  - r: Target rank for the low-rank approximation.
   *
   * Outputs:
   *  - X_new: Updated factor/parameter matrix (m×r).
   *  - Y_new: Updated factor/parameter matrix (n×r).
   */

  #-----------------------------------------------------------
  # 1) Orthonormal Extension for X and Y
  #-----------------------------------------------------------
  # We will form orthonormal complements for X and Y, each adding r columns.
  # For simplicity, below we create random matrices and orthonormalize them.
  # In the future, we might use more advanced approaches (QR-based or
  # local expansions relevant to the gradient directions).
  X_rand = rand(rows=nrow(X), cols=r)
  Y_rand = rand(rows=nrow(Y), cols=r)

  # Orthonormalize X
  X_ext = cbind(X, X_rand)
  # QR Decomposition: turn X_ext into an orthonormal basis.
  # Note: SystemDS's 'qr' returns Q,R as multi-return.
  [QX, RX] = qr(X_ext)
  # We'll keep just 2r columns of Q (since Q might have dimension m×m)
  X_hat = QX[, 1:(2*r)]

  # Orthonormalize Y
  Y_ext = cbind(Y, Y_rand)
  [QY, RY] = qr(Y_ext)
  Y_hat = QY[, 1:(2*r)]

  #-----------------------------------------------------------
  # 2) Gradient Step in Expanded Space
  #-----------------------------------------------------------
  # Similarly, we need the gradients w.r.t X_hat, Y_hat. If 'dX' and 'dY'
  # are for the original X, Y, a simple approach is to "expand" them
  # by zero-padding for the extra columns.
  dX_ext = cbind(dX, matrix(0, rows=nrow(X), cols=r))
  dY_ext = cbind(dY, matrix(0, rows=nrow(Y), cols=r))

  # Update step: X_hat_temp = X_hat - lr * dX_ext, etc.
  X_hat_temp = X_hat - (lr * dX_ext)
  Y_hat_temp = Y_hat - (lr * dY_ext)

  #-----------------------------------------------------------
  # 3) Rank-r Truncation via SVD
  #-----------------------------------------------------------
  # Construct a temporary matrix M_temp = X_hat_temp * (Y_hat_temp)ᵀ
  M_temp = X_hat_temp %*% t(Y_hat_temp)

  # SVD returns multiple outputs: U, S, and V
  [U, S, V] = svd(M_temp)

  # We will keep only the top-r singular values
  # Note: S is a diagonal matrix. We can slice it or build from the diag vector.
  S_diag = diag(S)
  s_top = S_diag[1:r]
  U_top = U[, 1:r]
  V_top = V[, 1:r]

  # Reconstruct X, Y from the rank-r approximation:
  #   M_temp ≈ U_top * diag(s_top) * V_topᵀ
  # Often we store X_new = U_top * sqrt(diag(s_top)), Y_new = V_top * sqrt(diag(s_top))
  sqrt_s_top = sqrt(s_top)
  X_new = U_top %*% diag(sqrt_s_top)
  Y_new = V_top %*% diag(sqrt_s_top)
}

init = function(matrix[double] X, matrix[double] Y)
    return (int r)
{
  /*
   * Here, we treat the number of columns (r) of X and Y
   * as the "rank parameter" for ScaledGD.
   * This parameter r is an upper bound on the actual
   * algebraic rank, because some columns may become
   * linearly dependent.
   *
   * Note: This is just a convenience function, and rank
   * may be initialized manually if needed.
   *
   * Inputs:
   *  - X: Current m×r matrix (factor or parameter).
   *  - Y: Current n×r matrix (factor or parameter).
   *
   * Outputs:
   *  - r: upper bound for rank
   */

  if (ncol(X) != ncol(Y)) {
    stop("X and Y must have the same number of columns in ScaledGD init.")
  }

  # The rank parameter (upper bound) is simply the number of columns in X
  r = ncol(X)
}
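
Similarly, a minimal sketch (not part of this commit) of how `scaled_gd::init` and `scaled_gd::update` might be applied to a low-rank matrix factorization; the synthetic target `M`, the factor shapes, the squared-error gradients, the learning rate, and the iteration count are illustrative assumptions:

# sketch only: assumes the script is run from the repository root; adjust the path otherwise
source("scripts/nn/optim/scaled_gd.dml") as scaled_gd

k = 4                                                  # assumed target rank
M = rand(rows=100, cols=k) %*% rand(rows=k, cols=80)   # synthetic low-rank target (m x n)
X = rand(rows=100, cols=k)                             # factor of shape m x r
Y = rand(rows=80, cols=k)                              # factor of shape n x r
r = scaled_gd::init(X, Y)                              # rank parameter = number of columns

for (i in 1:50) {
  R = X %*% t(Y) - M              # residual of the current factorization
  dX = R %*% Y                    # gradient of 0.5*||X %*% t(Y) - M||_F^2 w.r.t. X
  dY = t(R) %*% X                 # gradient w.r.t. Y
  [X, Y] = scaled_gd::update(X, Y, dX, dY, 0.001, r)
}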
