@@ -61,7 +61,6 @@ class SOAP(optim.Optimizer):
         precondition_warmup_steps: How many steps to warm up the preconditioner (i.e. update every step)
         adam_warmup_steps: How many steps to skip preconditioning in the beginning (i.e. use standard AdamW updates)
         precondition_1d: Whether to precondition 1D gradients (like biases).
-        max_precond_dim: Maximum dimension of the preconditioner matrices. Skips preconditioning if any tensor dimension exceeds.
         trace_normalization: Whether to normalize update by the trace of the kronecker factor matrix
         normalize_preconditioned_grads: Whether to normalize preconditioned gradients per layer
         correct_bias: Whether to use bias correction in Inner Adam and Kronecker factor matrices EMA
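
With `max_precond_dim` removed, any call site still passing it will fail with a `TypeError`. A minimal sketch of constructing the optimizer after this patch; `lr` is an assumed standard optimizer argument that does not appear in this hunk:

```python
import torch

model = torch.nn.Linear(128, 64)
optimizer = SOAP(
    model.parameters(),
    lr=3e-3,                          # assumed; not visible in this diff
    precondition_warmup_steps=0,
    adam_warmup_steps=1,
    precondition_1d=False,            # biases fall back to plain AdamW-style updates
    trace_normalization=False,
    normalize_preconditioned_grads=False,
    correct_bias=True,
    # max_precond_dim=8192,           # removed by this patch; passing it now raises TypeError
)
```
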
@@ -91,7 +90,6 @@ def __init__(
         precondition_warmup_steps: int = 0,
         adam_warmup_steps: int = 1,
         precondition_1d: bool = False,
-        max_precond_dim: int = 8192,
         trace_normalization: bool = False,
         normalize_preconditioned_grads: bool = False,
         correct_bias: bool = True,
@@ -141,7 +139,6 @@ def __init__(
141139 "precondition_warmup_steps" : precondition_warmup_steps ,
142140 "adam_warmup_steps" : adam_warmup_steps ,
143141 "precondition_1d" : precondition_1d ,
144- "max_precond_dim" : max_precond_dim ,
145142 "trace_normalization" : trace_normalization ,
146143 "normalize_preconditioned_grads" : normalize_preconditioned_grads ,
147144 "use_nesterov" : use_nesterov ,
@@ -194,7 +191,6 @@ def step(self, closure: Callable[[], float] | None = None) -> float | None:
                 state["GG"] = init_kronecker_factors(
                     grad,
                     precondition_1d=group["precondition_1d"],
-                    max_precond_dim=group["max_precond_dim"],
                 )
 
                 # Update preconditioner matrices with gradient statistics, do not use shampoo_beta for EMA at first step
@@ -204,7 +200,6 @@ def step(self, closure: Callable[[], float] | None = None) -> float | None:
                     grad=grad,
                     shampoo_beta=0.0,
                     precondition_1d=group["precondition_1d"],
-                    max_precond_dim=group["max_precond_dim"],
                 )
 
                 # Increment step counter
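
On the `shampoo_beta=0.0` call above: `Tensor.lerp_(end, weight)` computes `self + weight * (end - self)`, so the weight `1 - shampoo_beta = 1.0` makes the first update overwrite the zero-initialized factor with the gradient's outer product rather than averaging into it. A standalone sketch:

```python
import torch

factor = torch.zeros(3, 3)  # freshly initialized Kronecker factor
outer = torch.eye(3)        # stand-in for a gradient outer product

# First step: shampoo_beta = 0.0 -> lerp weight 1.0 replaces the factor outright.
factor.lerp_(outer, 1 - 0.0)
assert torch.equal(factor, outer)

# Subsequent steps with shampoo_beta in (0, 1) give the usual EMA:
# factor <- shampoo_beta * factor + (1 - shampoo_beta) * outer
factor.lerp_(2 * torch.eye(3), 1 - 0.95)
```
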
@@ -284,7 +279,6 @@ def step(self, closure: Callable[[], float] | None = None) -> float | None:
                     grad=grad,
                     shampoo_beta=shampoo_beta,
                     precondition_1d=group["precondition_1d"],
-                    max_precond_dim=group["max_precond_dim"],
                 )
                 torch.cuda.nvtx.range_pop()
 
@@ -330,7 +324,6 @@ def step(self, closure: Callable[[], float] | None = None) -> float | None:
 def init_kronecker_factors(
     grad: torch.Tensor,
     precondition_1d: bool = False,
-    max_precond_dim: int = 8192,
 ) -> List[torch.Tensor]:
     """Initializes the kronecker factor matrices for the SOAP optimizer.
 
@@ -354,8 +347,6 @@ def init_kronecker_factors(
             The shape of this tensor determines the size of the kronecker factor matrices.
         precondition_1d: Whether to create kronecker factor matrices for 1D tensors
             (like biases). If False, 1D tensors will skip preconditioning.
-        max_precond_dim: Maximum dimension of the preconditioner matrices.
-            Skips preconditioning if any tensor dimension exceeds.
 
     Returns:
         List[torch.Tensor]: List of kronecker factor matrices (L and R in paper).
@@ -387,21 +378,11 @@ def init_kronecker_factors(
         else:
             # Create a square preconditioner matrix for 1D tensors
             size = grad.shape[0]
-            if size > max_precond_dim:
-                # if tensor dimension is larger than max_precond_dim, skip preconditioning this dimension
-                # append empty tensor to kronecker_factor_list so that subsequent check that use numel() to check if preconditioner is initialized will not fail
-                kronecker_factor_list.append(torch.empty(0, device=grad.device))
-            else:
-                kronecker_factor_list.append(torch.zeros(size, size, device=grad.device))
+            kronecker_factor_list.append(torch.zeros(size, size, device=grad.device))
     else:
         # Create a square kronecker factor matrix for each dimension
         for size in grad.shape:
-            if size > max_precond_dim:
-                # append empty tensor to kronecker_factor_list so that subsequent check that use numel() to check if preconditioner is initialized will not fail
-                # skip preconditioning this dimension
-                kronecker_factor_list.append(torch.empty(0, device=grad.device))
-            else:
-                kronecker_factor_list.append(torch.zeros(size, size, device=grad.device))
+            kronecker_factor_list.append(torch.zeros(size, size, device=grad.device))
 
     return kronecker_factor_list
 
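
With the dimension cap gone, `init_kronecker_factors` always allocates a dense `size x size` zero matrix per gradient dimension, so very wide layers now get correspondingly large factors. A quick illustration of the returned shapes:

```python
import torch

# 2D gradient -> one square factor per dimension (L and R in the paper).
factors = init_kronecker_factors(torch.randn(10, 20))
print([f.shape for f in factors])  # [torch.Size([10, 10]), torch.Size([20, 20])]

# 1D gradient with precondition_1d=True -> a single (5, 5) factor.
bias_factors = init_kronecker_factors(torch.randn(5), precondition_1d=True)
print(bias_factors[-1].shape)  # torch.Size([5, 5])
```
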
@@ -412,7 +393,6 @@ def update_kronecker_factors(
     grad: torch.Tensor,
     shampoo_beta: float,
     precondition_1d: bool = False,
-    max_precond_dim: int = 8192,
 ) -> None:
     """Updates the preconditioner matrices using gradient outer products.
 
@@ -429,8 +409,6 @@ def update_kronecker_factors(
             Controls how much weight to give to new vs old gradient statistics.
         precondition_1d: Whether to apply preconditioning to 1D tensors (like biases).
             If False, 1D tensors will skip preconditioning.
-        max_precond_dim: Maximum dimension of the preconditioner matrices.
-            Skips preconditioning if any tensor dimension exceeds.
 
     Example:
         >>> grad = torch.randn(10, 20)
@@ -446,20 +424,22 @@ def update_kronecker_factors(
             kronecker_factor_list[0].lerp_(outer_product, 1 - shampoo_beta)
         else:
             # For 1D tensors, skip preconditioning
+            logging.error(
+                "1D tensor is passed to update_kronecker_factors, but precondition_1d is not set to True, skipping preconditioning."
+            )
             return
     else:
         # For higher dimensional tensors, compute outer products for each dimension
         for idx, dim_size in enumerate(grad.shape):
-            if dim_size <= max_precond_dim:
-                # Compute outer product by contracting all dimensions except idx
-                contract_dims = [*chain(range(idx), range(idx + 1, grad.dim()))]
-                outer_product = torch.tensordot(
-                    grad,
-                    grad,
-                    dims=[contract_dims] * 2,
-                )
-                # Update the corresponding Kronecker factor
-                kronecker_factor_list[idx].lerp_(outer_product, 1 - shampoo_beta)
+            # Compute outer product by contracting all dimensions except idx
+            contract_dims = [*chain(range(idx), range(idx + 1, grad.dim()))]
+            outer_product = torch.tensordot(
+                grad,
+                grad,
+                dims=[contract_dims] * 2,
+            )
+            # Update the corresponding Kronecker factor
+            kronecker_factor_list[idx].lerp_(outer_product, 1 - shampoo_beta)
 
 
 @torch.no_grad()  # type: ignore[misc]
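
The `tensordot` call in the new loop contracts every axis except `idx`; for a 2D gradient this reduces to the familiar Shampoo statistics `G @ G.T` and `G.T @ G`. A standalone check of that equivalence, mirroring the patched loop without the removed cap:

```python
from itertools import chain

import torch

G = torch.randn(10, 20)

for idx in range(G.dim()):
    # Contract all dimensions except idx, exactly as in update_kronecker_factors.
    contract_dims = [*chain(range(idx), range(idx + 1, G.dim()))]
    outer = torch.tensordot(G, G, dims=[contract_dims] * 2)
    # For 2D: idx=0 -> G @ G.T (10x10), idx=1 -> G.T @ G (20x20).
    expected = G @ G.T if idx == 0 else G.T @ G
    assert torch.allclose(outer, expected, atol=1e-5)
```
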