|
| 1 | +#------------------------------------------------------------- |
| 2 | +# |
| 3 | +# Licensed to the Apache Software Foundation (ASF) under one |
| 4 | +# or more contributor license agreements. See the NOTICE file |
| 5 | +# distributed with this work for additional information |
| 6 | +# regarding copyright ownership. The ASF licenses this file |
| 7 | +# to you under the Apache License, Version 2.0 (the |
| 8 | +# "License"); you may not use this file except in compliance |
| 9 | +# with the License. You may obtain a copy of the License at |
| 10 | +# |
| 11 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 12 | +# |
| 13 | +# Unless required by applicable law or agreed to in writing, |
| 14 | +# software distributed under the License is distributed on an |
| 15 | +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 16 | +# KIND, either express or implied. See the License for the |
| 17 | +# specific language governing permissions and limitations |
| 18 | +# under the License. |
| 19 | +# |
| 20 | +#------------------------------------------------------------- |
| 21 | + |
| 22 | +/* |
| 23 | + * ScaledGD optimizer. |
| 24 | + */ |
| 25 | + |
update = function(matrix[double] X, matrix[double] Y,
    matrix[double] dX, matrix[double] dY, double lr, int r)
  return (matrix[double] X_new, matrix[double] Y_new)
{
  /*
   * Performs one iteration of the Scaled Gradient Descent update.
   *
   * Reference:
   *  - "Accelerating Ill-Conditioned Low-Rank Matrix Estimation
   *     via Scaled Gradient Descent" (arXiv:2005.08898).
   *
   * Typical Steps:
   *  1) Orthonormal Extension (dimension doubling):
   *     - Extend X and Y to [X, X_perp] and [Y, Y_perp] so each is m x 2r
   *       and n x 2r, with orthonormal columns.
   *  2) Gradient Step:
   *     - Subtract lr*dX and lr*dY in the extended (2r) space.
   *       NOTE(review): dX/dY are gradients w.r.t. the ORIGINAL X, Y; we
   *       zero-pad them for the extra r columns as a simplification, which
   *       assumes the leading r columns of the extended basis approximate
   *       the span of X (resp. Y) — TODO confirm this matches the caller's
   *       gradient convention.
   *  3) Rank-r Truncation:
   *     - Recompute X_new, Y_new by SVD on the updated m x n matrix
   *       (X_hat - lr*dX_ext) %*% t(Y_hat - lr*dY_ext), keeping only the
   *       top-r singular values and corresponding vectors.
   *
   * Inputs:
   *  - X:  Current m x r matrix (factor or parameter).
   *  - Y:  Current n x r matrix (factor or parameter).
   *  - dX: Gradient w.r.t. X, same shape as X.
   *  - dY: Gradient w.r.t. Y, same shape as Y.
   *  - lr: Learning rate (scalar).
   *  - r:  Target rank for the low-rank approximation.
   *
   * Outputs:
   *  - X_new: Updated factor/parameter matrix (m x r).
   *  - Y_new: Updated factor/parameter matrix (n x r).
   */

  k = 2 * r;

  # An orthonormal extension to 2r columns requires at least 2r rows.
  if (k > nrow(X) | k > nrow(Y))
    stop("ScaledGD update: 2*r must not exceed nrow(X) and nrow(Y).");

  #-----------------------------------------------------------
  # 1) Orthonormal Extension for X and Y
  #-----------------------------------------------------------
  # Append r random columns, then orthonormalize the result. Random padding
  # makes the extended matrix full column rank almost surely, so R below is
  # invertible. More targeted extensions (e.g., gradient-aligned) could be
  # substituted here later.
  X_ext = cbind(X, rand(rows=nrow(X), cols=r));
  Y_ext = cbind(Y, rand(rows=nrow(Y), cols=r));

  # SystemDS qr() returns the Householder factor H and the triangular R —
  # it does NOT materialize the orthonormal Q directly. We recover the thin
  # Q via Q = A %*% inv(R_top), where R_top is the leading k x k upper-
  # triangular block of R (A = Q*R  =>  Q = A*R^{-1} when A has full column
  # rank). This Q has orthonormal columns and its first r columns span
  # col(X) (resp. col(Y)), preserving alignment with the original factors.
  [HX, RX] = qr(X_ext);
  X_hat = X_ext %*% inv(RX[1:k, 1:k]);

  [HY, RY] = qr(Y_ext);
  Y_hat = Y_ext %*% inv(RY[1:k, 1:k]);

  #-----------------------------------------------------------
  # 2) Gradient Step in Expanded Space
  #-----------------------------------------------------------
  # Zero-pad the gradients for the r extension columns (no descent signal
  # is available for the randomly generated directions).
  dX_ext = cbind(dX, matrix(0, rows=nrow(X), cols=r));
  dY_ext = cbind(dY, matrix(0, rows=nrow(Y), cols=r));

  X_hat_temp = X_hat - (lr * dX_ext);
  Y_hat_temp = Y_hat - (lr * dY_ext);

  #-----------------------------------------------------------
  # 3) Rank-r Truncation via SVD
  #-----------------------------------------------------------
  # Form the full updated estimate and truncate it back to rank r.
  M_temp = X_hat_temp %*% t(Y_hat_temp);

  [U, S, V] = svd(M_temp);

  # S is diagonal; extract its diagonal as a column vector and keep the
  # top-r entries (svd returns singular values in non-increasing order).
  S_diag = diag(S);
  s_top = S_diag[1:r];
  U_top = U[, 1:r];
  V_top = V[, 1:r];

  # Balanced factorization of the rank-r approximation:
  #   M_temp ~ U_top * diag(s_top) * t(V_top)
  #   X_new = U_top * diag(sqrt(s_top)),  Y_new = V_top * diag(sqrt(s_top))
  sqrt_s_top = sqrt(s_top);
  X_new = U_top %*% diag(sqrt_s_top);
  Y_new = V_top %*% diag(sqrt_s_top);
}
| 120 | + |
init = function(matrix[double] X, matrix[double] Y)
  return (int r)
{
  /*
   * Derives the rank parameter for ScaledGD from the factor matrices.
   *
   * The shared column count of X and Y serves as the rank parameter r.
   * It is an UPPER BOUND on the true algebraic rank, since columns of the
   * factors may be linearly dependent. This is a convenience helper only;
   * callers may set r manually instead.
   *
   * Inputs:
   *  - X: Current m x r matrix (factor or parameter).
   *  - Y: Current n x r matrix (factor or parameter).
   *
   * Outputs:
   *  - r: upper bound for rank
   */

  cols_X = ncol(X);

  # Both factors must agree on the number of columns for X %*% t(Y)
  # to be well-defined.
  if (cols_X != ncol(Y)) {
    stop("X and Y must have the same number of columns in ScaledGD init.")
  }

  r = cols_X;
}
| 151 | + |
0 commit comments