Improve document (#33)

skyw · web-flow · commit c3c451b97cc0 · 2025-09-30T19:19:57.000-07:00
Use math equations in some of the docstrings.

Get rid of long lines. Modified some unnecessary content.

No functional change.

Signed-off-by: Hao Wu <skyw@nvidia.com>
diff --git a/docs/apidocs/soap.md b/docs/apidocs/soap.md
@@ -21,4 +21,15 @@ emerging_optimizers.soap
 .. autofunction:: update_kronecker_factors
 
 .. autofunction:: update_eigenbasis_and_momentum
+
+emerging_optimizers.soap.soap_utils
+=====================================
+
+.. automodule:: emerging_optimizers.soap.soap_utils
+    :members:
+
+.. autofunction:: _orthogonal_iteration
+
+.. autofunction:: _conjugate
+
 ```
diff --git a/docs/conf.py b/docs/conf.py
@@ -72,6 +72,7 @@
     "numpy": ("https://numpy.org/doc/stable", None),
     "torch": ("https://pytorch.org/docs/2.5", None),
 }
+autodoc_typehints = "description"
 
 
 def linkcode_resolve(domain, info):
diff --git a/emerging_optimizers/orthogonalized_optimizers/muon.py b/emerging_optimizers/orthogonalized_optimizers/muon.py
@@ -37,9 +37,12 @@ class Muon(OrthogonalizedOptimizer):
     optimization via Frank-Wolfe.
 
     References:
-        - Jordan, K. *Muon Optimizer Implementation.* [`GitHub <https://github.com/KellerJordan/Muon/blob/master/muon.py>`_]
-        - *Modular Duality in Deep Learning.* arXiv:2410.21265 (2024). [`arXiv:2410.21265 <https://arxiv.org/abs/2410.21265>`_]
-        - *Training Deep Learning Models with Norm-Constrained LMOs.* arXiv:2502.07529 (2025). [`arXiv:2502.07529 <https://arxiv.org/abs/2502.07529>`_]
+        - Jordan, K. *Muon Optimizer Implementation.*
+          [`GitHub <https://github.com/KellerJordan/Muon/blob/master/muon.py>`_]
+        - *Modular Duality in Deep Learning.* arXiv:2410.21265 (2024).
+          [`arXiv:2410.21265 <https://arxiv.org/abs/2410.21265>`_]
+        - *Training Deep Learning Models with Norm-Constrained LMOs.* arXiv:2502.07529 (2025).
+          [`arXiv:2502.07529 <https://arxiv.org/abs/2502.07529>`_]
 
     Warning:
         - This optimizer requires that all parameters passed in are 2D.
@@ -122,7 +125,8 @@ def get_muon_scale_factor(
         # Suggested by K. Jordan and Kimi (https://arxiv.org/abs/2502.16982)
         return extra_scale_factor * max(size_out, size_in) ** 0.5
     elif mode == "unit_rms_norm":
-        # Suggested by Scion (https://arxiv.org/abs/2502.07529) and Bernstein et al. (https://jeremybernste.in/writing/deriving-muon)
+        # Suggested by Scion (https://arxiv.org/abs/2502.07529) and Bernstein et al.
+        # (https://jeremybernste.in/writing/deriving-muon)
         return extra_scale_factor * (size_out / size_in) ** 0.5
     else:
         raise ValueError(f"Invalid mode for Muon update scale factor: {mode}")
diff --git a/emerging_optimizers/orthogonalized_optimizers/orthogonalized_optimizer.py b/emerging_optimizers/orthogonalized_optimizers/orthogonalized_optimizer.py
@@ -45,9 +45,11 @@ class OrthogonalizedOptimizer(optim.Optimizer):
 
     - Carlson, D., Cevher, V., and Carin, L. *Stochastic spectral descent for Restricted Boltzmann Machines.*
       In International Conference on Artificial Intelligence and Statistics (2015a).
-    - Carlson, D., Hsieh, Y.-P., Collins, E., Carin, L., and Cevher, V. *Stochastic Spectral Descent for Discrete Graphical Models.*
+    - Carlson, D., Hsieh, Y.-P., Collins, E., Carin, L., and Cevher, V.
+      *Stochastic Spectral Descent for Discrete Graphical Models.*
       In IEEE Journal of Selected Topics in Signal Processing, vol. 10, no. 2, pp. 296-311 (2016).
-    - Carlson, D., Collins, E., Hsieh, Y.-P., Carin, L., and Cevher, V. *Preconditioned spectral descent for deep learning.*
+    - Carlson, D., Collins, E., Hsieh, Y.-P., Carin, L., and Cevher, V.
+      *Preconditioned spectral descent for deep learning.*
       In Neural Information Processing Systems (2015b).
     - Flynn, T. *The duality structure gradient descent algorithm: analysis and applications to neural networks.*
       arXiv preprint arXiv:1708.00523 (2017). [`arXiv:1708.00523 <https://arxiv.org/abs/1708.00523>`_]
diff --git a/emerging_optimizers/soap/soap.py b/emerging_optimizers/soap/soap.py
@@ -125,7 +125,8 @@ def __init__(
             original_adam_warmup_steps = adam_warmup_steps
             adam_warmup_steps = max(1, precondition_warmup_steps - 1)
             logging.info(
-                f"adam_warmup_steps ({original_adam_warmup_steps}) should be less than precondition_warmup_steps ({precondition_warmup_steps}). "
+                f"adam_warmup_steps ({original_adam_warmup_steps}) should be less "
+                f"than precondition_warmup_steps ({precondition_warmup_steps}). "
                 f"Setting adam_warmup_steps to {adam_warmup_steps} by default."
             )
 
@@ -193,7 +194,8 @@ def step(self, closure: Callable[[], float] | None = None) -> float | None:
                         precondition_1d=group["precondition_1d"],
                     )
 
-                    # Update preconditioner matrices with gradient statistics, do not use shampoo_beta for EMA at first step
+                    # Update preconditioner matrices with gradient statistics,
+                    # do not use shampoo_beta for EMA at first step
                     with utils.fp32_matmul_precision(group["fp32_matmul_prec"]):
                         update_kronecker_factors(
                             kronecker_factor_list=state["GG"],
@@ -282,7 +284,8 @@ def step(self, closure: Callable[[], float] | None = None) -> float | None:
                     )
                 torch.cuda.nvtx.range_pop()
 
-                # If current step is the last step to skip preconditioning, initialize eigenbases and end first order warmup
+                # If current step is the last step to skip preconditioning, initialize eigenbases and
+                # end first order warmup
                 if state["step"] == group["adam_warmup_steps"]:
                     # Obtain kronecker factor eigenbases from kronecker factor matrices using eigendecomposition
                     state["Q"] = get_eigenbasis_eigh(state["GG"])
@@ -425,7 +428,8 @@ def update_kronecker_factors(
         else:
             # For 1D tensors, skip preconditioning
             logging.error(
-                "1D tensor is passed to update_kronecker_factors, but precondition_1d is not set to True, skipping preconditioning."
+                "1D tensor is passed to update_kronecker_factors, "
+                "but precondition_1d is not set to True, skipping preconditioning."
             )
             return
     else:
@@ -586,7 +590,8 @@ def precondition(
             )
         else:
             # Permute gradient dimensions to process the next dimension in the following iteration
-            # when preconditioning for the current dimension is skipped (Q is empty), in the case of one-sided preconditioning.
+            # when preconditioning for the current dimension is skipped (Q is empty), in the case of
+            # one-sided preconditioning.
             permute_order = list(range(1, grad.dim())) + [0]
             grad = grad.permute(permute_order)
 
diff --git a/emerging_optimizers/soap/soap_utils.py b/emerging_optimizers/soap/soap_utils.py
@@ -85,7 +85,7 @@ def get_eigenbasis_eigh(
                 # We use an empty tensor so that the `precondition` function will skip this factor.
                 updated_eigenbasis_list.append(torch.empty(0, device=kronecker_factor.device))
                 continue
-            # Construct approximated eigenvalues using QL^T@L@QL or QR^T@R@QR.
+            # Construct approximated eigenvalues using :math:`Q_L^T L Q_L` or :math:`Q_R^T R Q_R`.
             # The approximated eigenvalues should be close to diagonal if the eigenbasis is close to the true
             # eigenbasis of the kronecker factor (i.e. the approximated eigenvectors diagonalize the kronecker factor)
             approx_eigenvalue_matrix = eigenbasis.T @ kronecker_factor @ eigenbasis
@@ -128,8 +128,8 @@ def get_eigenbasis_qr(
     Computes using multiple rounds of power iteration followed by QR decomposition (orthogonal iteration).
 
     Args:
-        kronecker_factor_list: List containing preconditioner (GGT and GTG)
-        eigenbasis_list: List containing eigenbases (QL and QR)
+        kronecker_factor_list: List containing preconditioner (:math:`GG^T` and :math:`G^TG`)
+        eigenbasis_list: List containing eigenbases (:math:`Q_L` and :math:`Q_R`)
         exp_avg_sq: inner adam second moment (exp_avg_sq). This tensor is modified in-place.
         convert_to_float: If True, preconditioner matrices and their corresponding
             orthonormal matrices will be cast to float. Otherwise, they are left in
@@ -207,7 +207,7 @@ def get_eigenbasis_qr(
         # Update eigenbasis when necessary. Update is skipped only when use_adaptive_criteria is True
         # but criteria is not met.
         if_update = True
-        # construct approximated eigenvalues using QL^T@L@QL or QR^T@R@QR, which should be close to diagonal
+        # construct approximated eigenvalues using :math:`Q_L^T L Q_L` or :math:`Q_R^T R Q_R`, which should be close to diagonal
         # if the eigenbasis is close to the true eigenbasis of the kronecker factor (i.e. diagonalizes it)
         if use_adaptive_criteria:
             approx_eigenvalue_matrix = _conjugate(kronecker_factor, eigenbasis)
diff --git a/emerging_optimizers/utils/eig.py b/emerging_optimizers/utils/eig.py
@@ -37,13 +37,15 @@ def eigh_with_fallback(
     Default 2nd argument of eigh UPLO is 'L'.
 
     Args:
-        x: Tensor of shape (*, n, n) where "*" is zero or more batch dimensions consisting of symmetric or Hermitian matrices.
+        x: Tensor of shape (*, n, n) where "*" is zero or more batch dimensions consisting of symmetric or
+            Hermitian matrices.
         force_double: Force double precision computation. Default False.
-        eps: Small offset for numerical stability. If None, uses dtype-appropriate values (1e-7 for float32, 1e-15 for float64). Default None.
+        eps: Small offset for numerical stability. If None, uses dtype-appropriate values (1e-7 for float32,
+            1e-15 for float64). Default None.
         output_dtype: Desired output dtype. If None, uses input dtype. Default None.
 
     Returns:
-        tuple[Tensor, Tensor]: Eigenvalues and eigenvectors tuple (eigenvalues in descending order).
+        Eigenvalues and eigenvectors tuple (eigenvalues in descending order).
     """
     input_dtype = x.dtype
     if output_dtype is None:
@@ -100,25 +102,26 @@ def eig_orthogonal_iteration(
     max_iterations: int = 1,
     tolerance: float = 0.01,
 ) -> tuple[Tensor, Tensor]:
-    """Approximately compute the eigendecomposition of a symmetric matrix by performing the orthogonal iteration algorithm.
+    """Approximately compute the eigen decomposition
 
 
-    Orthogonal or subspace iteration uses iterative power iteration and QR decomposition to update the approximated eigenvectors.
-    When the initial estimate is the zero matrix, the eigendecomposition is computed using `eigh_with_fallback`.
+    Orthogonal or subspace iteration uses iterative power iteration and QR decomposition to update the approximated
+    eigenvectors. When the initial estimate is the zero matrix, the eigendecomposition is computed
+    using `eigh_with_fallback`.
 
-    Based on Purifying Shampoo (https://www.arxiv.org/abs/2506.03595), we use an early exit criteria to stop the QR iterations.
-    This generalizes SOAP's algorithm of 1 step of power iteration for updating the eigenbasis.
+    Based on Purifying Shampoo (https://www.arxiv.org/abs/2506.03595), we use an early exit criteria to stop the
+    QR iterations. This generalizes SOAP's algorithm of 1 step of power iteration for updating the eigenbasis.
 
     Args:
         x: tensor of shape (n, n) where x is a symmetric or Hermitian matrix.
         approx_eigenvectors: The current estimate of the eigenvectors of x. If None or a zero matrix,
             falls back to using `eigh_with_fallback`.
-        max_iterations: The maximum number of iterations to perform. (Default: 1)
-        tolerance: The tolerance for determining convergence in terms of the norm of the off-diagonal elements of the approximated eigenvalues.
-            (Default: 0.01)
+        max_iterations: The maximum number of iterations to perform.
+        tolerance: The tolerance for determining convergence in terms of the norm of the off-diagonal elements
+            of the approximated eigenvalues.
 
     Returns:
-        tuple[Tensor, Tensor]: A tuple containing the approximated eigenvalues and eigenvectors matrix of the input matrix A.
+        A tuple containing the approximated eigenvalues and eigenvectors matrix of the input matrix A.
     """
 
     # Check if x is already a diagonal matrix
@@ -151,12 +154,14 @@ def eig_orthogonal_iteration(
 def met_approx_eigvals_criteria(approx_eigenvalues_matrix: Tensor, tolerance: float) -> bool:
     """Evaluates if a criteria using approximated eigenvalues is below or equal to the tolerance.
 
-    `approx_eigenvalues_matrix` is a matrix created from the approximated eigenvectors and the symmetric matrix that is being eigendecomposed.
-    We check if the ratio of the diagonal norm to the matrix norm is greater than or equal to (1 - tolerance).
+    `approx_eigenvalues_matrix` is a matrix created from the approximated eigenvectors and the symmetric matrix
+    that is being eigendecomposed. We check if the ratio of the diagonal norm to the matrix norm is greater
+    than or equal to (1 - tolerance).
 
     Args:
         approx_eigenvalues_matrix: The symmetric matrix whose eigenvalues is being eigendecomposed.
-        tolerance: The tolerance for the early exit criteria, the min relative error between diagonal norm and matrix norm of the approximated eigenvalues and the diagonal.
+        tolerance: The tolerance for the early exit criteria, the min relative error between diagonal norm
+            and matrix norm of the approximated eigenvalues and the diagonal.
 
     Returns:
         bool: True if the criteria is below or equal to the tolerance, False otherwise.
@@ -189,7 +194,7 @@ def _try_handle_diagonal_matrix(x: Tensor) -> Optional[tuple[Tensor, Tensor]]:
         x: Tensor of shape (n, n) where x is a symmetric or Hermitian matrix.
 
     Returns:
-        Optional[tuple[Tensor, Tensor]]: Sorted eigenvalues and eigenvectors if A is diagonal, None otherwise.
+        Sorted eigenvalues and eigenvectors if A is diagonal, None otherwise.
     """
     input_dtype = x.dtype
     if _is_diagonal(x):
diff --git a/tests/test_soap_functions.py b/tests/test_soap_functions.py
@@ -149,7 +149,7 @@ def test_tensordot_vs_matmul(self, m, n):
         {"N": 32, "M": 8},
     )
     def test_project_and_project_back(self, N: int, M: int) -> None:
-        """Tests that projecting a tensor to eigenbasis of QL and QR and then projecting it back results in the original tensor.
+        """Tests that projecting a tensor to eigenbasis of QL and QR and back
 
         The projected tensor should approximately recover the original tensor.
         """

Original file line number	Diff line number	Diff line change
`@@ -72,6 +72,7 @@`
`72`	`72`	`"numpy": ("https://numpy.org/doc/stable", None),`
`73`	`73`	`"torch": ("https://pytorch.org/docs/2.5", None),`
`74`	`74`	`}`
	`75`	`+autodoc_typehints = "description"`
`75`	`76`
`76`	`77`
`77`	`78`	`def linkcode_resolve(domain, info):`
Original file line number	Diff line number	Diff line change
`@@ -149,7 +149,7 @@ def test_tensordot_vs_matmul(self, m, n):`
`149`	`149`	`{"N": 32, "M": 8},`
`150`	`150`	`)`
`151`	`151`	`def test_project_and_project_back(self, N: int, M: int) -> None:`
`152`		`- """Tests that projecting a tensor to eigenbasis of QL and QR and then projecting it back results in the original tensor.`
	`152`	`+ """Tests that projecting a tensor to eigenbasis of QL and QR and back`
`153`	`153`
`154`	`154`	`The projected tensor should approximately recover the original tensor.`
`155`	`155`	`"""`