
Commit dab19bc

addressed PR comments
Signed-off-by: mikail <[email protected]>
1 parent 34a25e8 commit dab19bc

File tree

6 files changed: +98 -83 lines changed


emerging_optimizers/psgd/procrustes_step.py

Lines changed: 5 additions & 3 deletions
@@ -38,6 +38,7 @@ def procrustes_step(Q: torch.Tensor, max_step_size: float = 0.125) -> torch.Tensor:
         Q: Tensor of shape (n, n), general square matrix to orthogonalize.
         max_step_size: Maximum step size for the line search. Default is 1/8 (0.125).
     """
+    # Note: this function is written in fp32 to avoid numerical instability while computing the Taylor expansion of the exponential map
     with utils.fp32_matmul_precision("highest"):
         R = Q.T - Q
         R /= norm_lower_bound_skew(R) + torch.finfo(R.dtype).smallest_normal
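
The `utils.fp32_matmul_precision` helper is defined elsewhere in the package, not in this diff; presumably it temporarily switches PyTorch's global float32 matmul precision and restores it on exit. A minimal sketch of such a context manager, under that assumption:

    import contextlib
    from typing import Iterator

    import torch

    @contextlib.contextmanager
    def fp32_matmul_precision(precision: str) -> Iterator[None]:
        """Temporarily set float32 matmul precision ('highest', 'high', or 'medium')."""
        previous = torch.get_float32_matmul_precision()
        torch.set_float32_matmul_precision(precision)
        try:
            yield
        finally:
            torch.set_float32_matmul_precision(previous)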
@@ -48,11 +49,12 @@ def procrustes_step(Q: torch.Tensor, max_step_size: float = 0.125) -> torch.Tensor:
         RRQ = R @ RQ
         tr_RRQ = torch.trace(RRQ)
         # clip step size to max_step_size, based on a 2nd order expansion.
-        step_size = torch.clamp(-tr_RQ / tr_RRQ, min=0, max=max_step_size)
+        _step_size = torch.clamp(-tr_RQ / tr_RRQ, min=0, max=max_step_size)
         # If tr_RRQ >= 0, the quadratic approximation is not concave, we fall back to max_step_size.
-        a = torch.where(tr_RRQ < 0, step_size, max_step_size)
+        step_size = torch.where(tr_RRQ < 0, _step_size, max_step_size)
         # rotate Q as exp(a R) Q ~ (I + a R + a^2 R^2/2) Q with an optimal step size by line search
         # for 2nd order expansion, only expand exp(a R) to its 2nd term.
-        Q += a * (RQ + 0.5 * a * RRQ)
+        # Q += step_size * (RQ + 0.5 * step_size * RRQ)
+        Q = torch.add(Q, torch.add(RQ, RRQ, alpha=0.5 * step_size), alpha=step_size)

     return Q
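
The rewritten update is algebraically identical to the commented-out in-place form: the nested `torch.add` calls expand to `Q + step_size * RQ + 0.5 * step_size**2 * RRQ`, the second-order truncation of `exp(step_size * R) @ Q`. A quick standalone check of that equivalence (with a plain float step size; not part of the commit):

    import torch

    torch.manual_seed(0)
    Q = torch.randn(4, 4)
    R = Q.T - Q  # skew-symmetric direction, as in procrustes_step
    RQ = R @ Q
    RRQ = R @ RQ
    step_size = 0.125

    # In-place style: Q + s * (RQ + 0.5 * s * RRQ)
    expected = Q + step_size * (RQ + 0.5 * step_size * RRQ)
    # Fused torch.add style from the commit
    actual = torch.add(Q, torch.add(RQ, RRQ, alpha=0.5 * step_size), alpha=step_size)

    torch.testing.assert_close(actual, expected)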

emerging_optimizers/psgd/psgd_kron_contractions.py

Lines changed: 28 additions & 24 deletions
@@ -90,39 +90,43 @@ def apply_preconditioner(Q_list: List[torch.Tensor], X: torch.Tensor) -> torch.Tensor:
     return Px


-def _mode_n_mul_and_permute(X: torch.Tensor, M: torch.Tensor, mode: int) -> torch.Tensor:
-    """Multiply tensor X along axis `mode` by 2D matrix M.
+def _dim_n_mul_and_permute(X: torch.Tensor, M: torch.Tensor, contract_dim: int) -> torch.Tensor:
+    """Multiply tensor X along axis `contract_dim` by 2D matrix M.

     Helper function for `_apply_single_kronecker_factor`.
-    If M is (d_out, d_in) we contract M's second index with X's `mode` index.
-    `torch.tensordot` is used to contract the two tensors, and then the result is permuted to move the new axis 0 to position `mode`.
-    Returns a new tensor of the same rank, but with size[mode] replaced by d_out.
-    Note that d_{mode} == d_in.
+    If M is (d_out, d_in) we contract M's second index with X's `contract_dim` index.
+    `torch.tensordot` is used to contract the two tensors, and then the result is permuted to move the new axis 0 to position `contract_dim`.
+    Returns a new tensor of the same rank, but with size[contract_dim] replaced by d_out.
+    Note that d_{contract_dim} == d_in.

     Args:
-        X: Tensor of shape (d_0, d_1, ..., d_{mode-1}, d_{mode}, d_{mode+1}, ..., d_N)
+        X: Tensor of shape (d_0, d_1, ..., d_{contract_dim-1}, d_{contract_dim}, d_{contract_dim+1}, ..., d_N)
         M: Tensor of shape (d_out, d_in)
-        mode: int, the mode to contract with M, with d_{mode} == d_in
+        contract_dim: int, the dimension to contract with M, with d_{contract_dim} == d_in

     Returns:
-        Tensor of shape (d_0, d_1, ..., d_{mode-1}, d_out, d_{mode+1}, ..., d_N)
+        Tensor of shape (d_0, d_1, ..., d_{contract_dim-1}, d_out, d_{contract_dim+1}, ..., d_N)

-    Example:
-        X = torch.randn(2, 3, 6)
-        M = torch.randn(5, 6)
-        mode = 2
-        result = _mode_n_mul_and_permute(X, M, mode)
-        print(result.shape)  # Output: torch.Size([2, 3, 5])
+    Examples
+    --------
+    >>> X = torch.randn(2, 3, 6)
+    >>> M = torch.randn(5, 6)
+    >>> contract_dim = 2
+    >>> result = _dim_n_mul_and_permute(X, M, contract_dim)
+    >>> print(result.shape)
+    torch.Size([2, 3, 5])

     """
-    if X.shape[mode] != M.shape[1]:
-        raise ValueError(f"Shape mismatch: X.shape[{mode}] = {X.shape[mode]}, M.shape[1] = {M.shape[1]}")
-    # Contract M's 2nd dim (idx=1) with X's `mode` dim
-    Y = torch.tensordot(M, X, dims=([1], [mode]))
-    # Y now has shape (d_out, d_0, …, d_{mode-1}, d_{mode+1}, …).
-    # We want to move that new axis 0 back to position `mode`, due to `torch.tensordot`.
+    if X.shape[contract_dim] != M.shape[1]:
+        raise ValueError(
+            f"Shape mismatch: X.shape[{contract_dim}] = {X.shape[contract_dim]}, M.shape[1] = {M.shape[1]}"
+        )
+    # Contract M's 2nd dim (idx=1) with X's `contract_dim` dim
+    Y = torch.tensordot(M, X, dims=([1], [contract_dim]))
+    # Y now has shape (d_out, d_0, …, d_{contract_dim-1}, d_{contract_dim+1}, …).
+    # We want to move that new axis 0 back to position `contract_dim`, due to `torch.tensordot`.
     nd = X.dim()
-    perm = list(range(1, mode + 1)) + [0] + list(range(mode + 1, nd))
+    perm = list(range(1, contract_dim + 1)) + [0] + list(range(contract_dim + 1, nd))
     return Y.permute(perm)


@@ -141,5 +145,5 @@ def _apply_single_kronecker_factor(Q_list: List[torch.Tensor], X: torch.Tensor,
         shape = [1] * X.dim()
         shape[axis] = Q.size(0)
         return X * Q.view(shape)
-    else:
-        return _mode_n_mul_and_permute(X, Q, mode=axis)
+
+    return _dim_n_mul_and_permute(X, Q, contract_dim=axis)
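
The tensordot-plus-permute pattern above can be cross-checked against an equivalent `torch.einsum` contraction; a small standalone sketch (not part of the commit):

    import torch

    X = torch.randn(2, 3, 6)
    M = torch.randn(5, 6)

    # Contract M's second index (size 6) with dim 2 of X; tensordot puts the new axis first.
    Y = torch.tensordot(M, X, dims=([1], [2]))  # shape (5, 2, 3)
    # Move the new axis back to position 2, as _dim_n_mul_and_permute does.
    result = Y.permute(1, 2, 0)  # shape (2, 3, 5)

    # The same contraction written as an einsum over the shared index k.
    torch.testing.assert_close(result, torch.einsum("ok,abk->abo", M, X))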

emerging_optimizers/psgd/psgd_utils.py

Lines changed: 7 additions & 7 deletions
@@ -18,14 +18,14 @@


 __all__ = [
-    "balance_q_in_place",
+    "uniformize_q_in_place",
     "norm_lower_bound_spd",
     "norm_lower_bound_skew",
 ]


 @torch.compile  # type: ignore[misc]
-def balance_q_in_place(Q_list: List[torch.Tensor]) -> None:
+def uniformize_q_in_place(Q_list: List[torch.Tensor]) -> None:
     """Balance the dynamic ranges of kronecker factors in place to prevent numerical underflow or overflow.

     Each tensor in `Q_list` is rescaled so that its maximum absolute entry
@@ -71,7 +71,7 @@ def balance_q_in_place(Q_list: List[torch.Tensor]) -> None:

 @torch.compile  # type: ignore[misc]
 def norm_lower_bound_spd(A: torch.Tensor, k: int = 4, half_iters: int = 2, eps: float = 1e-8) -> torch.Tensor:
-    r"""Returns a cheap lower bound for the spectral norm of a symmetric positive definite matrix.
+    r"""A cheap lower bound for the spectral norm of a symmetric positive definite matrix.


     Args:
@@ -84,7 +84,7 @@ def norm_lower_bound_spd(A: torch.Tensor, k: int = 4, half_iters: int = 2, eps: float = 1e-8) -> torch.Tensor:
         A scalar giving a lower bound on :math:`\\|A\\|_2`.
     """

-    # Compute normalizing factor from the largest diagonal entry to prevent overflow/underflow and use smallest representable normal number for numerical stability
+    # Compute normalizing factor from the largest diagonal entry to prevent overflow/underflow and use small number for numerical stability
     normalization = A.diagonal().amax() + eps
     A = A / normalization

@@ -95,7 +95,7 @@ def norm_lower_bound_spd(A: torch.Tensor, k: int = 4, half_iters: int = 2, eps: float = 1e-8) -> torch.Tensor:

 @torch.compile  # type: ignore[misc]
 def norm_lower_bound_skew(A: torch.Tensor, k: int = 32, half_iters: int = 2, eps: float = 1e-8) -> torch.Tensor:
-    """Compute a cheap lower bound on the spectral norm (largest eigenvalue) of skew-symmetric matrix.
+    """A cheap lower bound on the spectral norm (largest eigenvalue) of skew-symmetric matrix.


     Note: For skew-symmetric matrices, all diagonal entries are zero and :math:`A^T = -A`.
@@ -112,7 +112,7 @@ def norm_lower_bound_skew(A: torch.Tensor, k: int = 32, half_iters: int = 2, eps: float = 1e-8) -> torch.Tensor:

     """

-    # Normalize to avoid extreme values, by extracting the max absolute value and use smallest representable normal number for numerical stability
+    # Normalize to avoid extreme values, by extracting the max absolute value and use small number for numerical stability
     normalizing_factor = A.abs().amax() + eps
     A = A / normalizing_factor

@@ -128,7 +128,7 @@ def _subspace_iteration_bound(
     half_iters: int = 2,
     eps: float = 1e-8,
 ) -> torch.Tensor:
-    """Helper function for subspace iteration to estimate spectral norm bounds.
+    """A helper function for subspace iteration to estimate spectral norm bounds.

     Uses numerically stable subspace iteration with a random initialization that aligns with the
     largest row of A to approximate the dominant eigenspace. This is more robust than simple
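
Both bounds follow the same recipe: rescale the matrix so its entries are O(1), bound the rescaled matrix with a few matvecs, then undo the scaling. Since ||A @ x|| <= ||A||_2 for any unit vector x, even one well-chosen matvec yields a valid (if loose) lower bound. A toy illustration of that principle for SPD matrices; this is not the library's `_subspace_iteration_bound`, which tightens the estimate with k vectors over several iterations:

    import torch

    def crude_norm_lower_bound_spd(A: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
        """One-matvec lower bound on ||A||_2 for a symmetric positive definite A."""
        # Rescale so entries are O(1); undo the scaling on the way out.
        normalization = A.diagonal().amax() + eps
        A = A / normalization
        # Use the column with the largest diagonal entry as a cheap test vector.
        i = torch.argmax(A.diagonal())
        x = A[:, i] / (A[:, i].norm() + eps)
        # ||A @ x|| <= ||A||_2 for any unit x, so this never overestimates.
        return (A @ x).norm() * normalization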

tests/test_procrustes_step.py

Lines changed: 20 additions & 21 deletions
@@ -15,20 +15,25 @@
 import math

 import torch
-from absl import testing
+from absl import flags, testing
 from absl.testing import parameterized

 from emerging_optimizers.psgd.procrustes_step import procrustes_step
 from emerging_optimizers.utils import fp32_matmul_precision


+# Define command line flags
+flags.DEFINE_string("device", "cpu", "Device to run tests on: 'cpu' or 'cuda'")
+
+FLAGS = flags.FLAGS
+
+
 class ProcrustesStepTest(parameterized.TestCase):
     """Test cases for procrustes_step function."""

     def setUp(self) -> None:
         """Set up test fixtures."""
-        torch.manual_seed(42)
-        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        self.device = FLAGS.device

     def _procrustes_objective(self, Q: torch.Tensor) -> torch.Tensor:
         """Helper function to compute Procrustes objective ||Q^H Q - I||_F^2."""
@@ -49,15 +54,6 @@ def test_improves_orthogonality_simple_case(self) -> None:

         self.assertLessEqual(final_obj.item(), initial_obj.item() + 1e-6)

-    def test_modifies_matrix_in_place(self) -> None:
-        """Test that procrustes_step modifies the matrix in place."""
-        Q = torch.randn(3, 3, device=self.device)
-        Q_original_id = id(Q)
-
-        Q = procrustes_step(Q, max_step_size=1 / 16)
-
-        self.assertEqual(id(Q), Q_original_id)
-
     @parameterized.parameters(
         (8,),
         (128,),
@@ -91,20 +87,20 @@ def test_handles_small_norm_gracefully(self) -> None:

         initial_obj = self._procrustes_objective(Q)

-        Q = procrustes_step(Q, max_step_size=1 / 16)
+        Q = procrustes_step(Q, max_step_size=0.0625)

         final_obj = self._procrustes_objective(Q)

         self.assertLess(final_obj.item(), 1e-6)
         self.assertLess(final_obj.item(), initial_obj.item() + 1e-6)

     @parameterized.parameters(
-        (1 / 64,),
-        (1 / 32,),
-        (1 / 16,),
-        (1 / 8,),
+        (0.015625,),
+        (0.03125,),
+        (0.0625,),
+        (0.125,),
     )
-    def test_different_step_sizes(self, max_step_size: float) -> None:
+    def test_different_step_sizes_reduces_objective(self, max_step_size: float) -> None:
         """Test procrustes_step improvement with different step sizes."""
         perturbation = 1e-1 * torch.randn(10, 10, device=self.device, dtype=torch.float32) / math.sqrt(10)
         Q = torch.linalg.qr(torch.randn(10, 10, device=self.device, dtype=torch.float32)).Q + perturbation
@@ -122,12 +118,14 @@ def test_different_step_sizes(self, max_step_size: float) -> None:
         (512,),
         (8192,),
     )
-    def test_different_matrix_sizes(self, size: int) -> None:
+    def test_different_matrix_sizes_reduces_objective(self, size: int) -> None:
         """Test procrustes_step improvement with different matrix sizes."""
         # Create a non-orthogonal matrix by scaling an orthogonal one
         A = torch.randn(size, size, device=self.device, dtype=torch.float32)
         with fp32_matmul_precision("highest"):
             Q_orth, _ = torch.linalg.qr(A)
+        # Add perturbation, we choose 1e-2 to be small enough to not affect the objective too much
+        # but large enough to make the matrix non-orthogonal.
         Q = Q_orth + 1e-2 * torch.randn(size, size, device=self.device, dtype=torch.float32) / math.sqrt(size)
         max_step_size = 0.5 * size ** (-1 / 3)
         initial_obj = self._procrustes_objective(Q)
@@ -147,8 +145,8 @@ def test_preserves_determinant_sign_for_real_matrices(self) -> None:
         initial_det_pos = torch.det(Q_pos)
         initial_det_neg = torch.det(Q_neg)

-        Q_pos = procrustes_step(Q_pos, max_step_size=1 / 16)
-        Q_neg = procrustes_step(Q_neg, max_step_size=1 / 16)
+        Q_pos = procrustes_step(Q_pos, max_step_size=0.0625)
+        Q_neg = procrustes_step(Q_neg, max_step_size=0.0625)

         final_det_pos = torch.det(Q_pos)
         final_det_neg = torch.det(Q_neg)
@@ -159,4 +157,5 @@ def test_preserves_determinant_sign_for_real_matrices(self) -> None:


 if __name__ == "__main__":
+    torch.manual_seed(42)
     testing.absltest.main()
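
With the device now an absl flag parsed by `absltest.main()`, the target device would be picked per invocation, e.g. `python tests/test_procrustes_step.py --device=cuda`. A minimal standalone sketch of the flag pattern (hypothetical test, not from the commit):

    from absl import flags
    from absl.testing import absltest

    flags.DEFINE_string("device", "cpu", "Device to run tests on: 'cpu' or 'cuda'")
    FLAGS = flags.FLAGS

    class DeviceFlagTest(absltest.TestCase):
        def test_flag_is_parsed(self) -> None:
            # absltest.main() parses command-line flags before running tests.
            self.assertIn(FLAGS.device, ("cpu", "cuda"))

    if __name__ == "__main__":
        absltest.main()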

tests/test_psgd_contractions.py

Lines changed: 16 additions & 10 deletions
@@ -13,25 +13,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import torch
-from absl import testing
+from absl import flags, testing
 from absl.testing import parameterized

 from emerging_optimizers.psgd.psgd_kron_contractions import (
-    _mode_n_mul_and_permute,
+    _dim_n_mul_and_permute,
     apply_kronecker_factors,
     apply_preconditioner,
     partial_contraction,
 )
 from emerging_optimizers.utils import fp32_matmul_precision


+# Define command line flags
+flags.DEFINE_string("device", "cpu", "Device to run tests on: 'cpu' or 'cuda'")
+
+FLAGS = flags.FLAGS
+
+
 class TestPSGDKronContractions(parameterized.TestCase):
     """Test cases for PSGD Kronecker contractions."""

     def setUp(self) -> None:
         """Set up test fixtures."""
-        torch.manual_seed(42)
-        self.device = torch.device("cuda")
+        self.device = FLAGS.device

     @parameterized.parameters(
         (2, 3, 3),
@@ -111,22 +116,23 @@ def test_apply_preconditioner_matches_reconstructed(self) -> None:
         (2, 3, 5, 2),
         (4, 6, 2, 1),
     )
-    def test_mode_n_mul_and_permute_shapes(self, dim0: int, dim1: int, dim2: int, mode: int) -> None:
-        """Test `_mode_n_mul_and_permute` with non-uniform shapes and different modes."""
+    def test_dim_n_mul_and_permute_matches_shapes(self, dim0: int, dim1: int, dim2: int, contract_dim: int) -> None:
+        """Test `_dim_n_mul_and_permute` with non-uniform shapes and different contract_dim."""
         X = torch.randn(dim0, dim1, dim2, device=self.device)
         input_shape = X.shape

-        input_dim = input_shape[mode]
+        input_dim = input_shape[contract_dim]
         output_dim = 7  # arbitrary output dimension
         M = torch.randn(output_dim, input_dim, device=self.device)

-        result = _mode_n_mul_and_permute(X, M, mode)
+        result = _dim_n_mul_and_permute(X, M, contract_dim)

-        # Verify output shape: same as input but dimension `mode` replaced by output_dim
+        # Verify output shape: same as input but dimension `contract_dim` replaced by output_dim
         expected_shape = list(input_shape)
-        expected_shape[mode] = output_dim
+        expected_shape[contract_dim] = output_dim
         self.assertEqual(result.shape, torch.Size(expected_shape))


 if __name__ == "__main__":
+    torch.manual_seed(42)
     testing.absltest.main()
