From 69c796564ec2d57dbd3e4a92c747cf1f4b4179e8 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Fri, 7 Nov 2025 19:36:55 -0800
Subject: [PATCH 1/6] Update sphinx doc

Signed-off-by: Hao Wu
---
 README.md                                 | 13 +++++++------
 docs/apidocs/index.md                     |  3 ++-
 docs/apidocs/orthogonalized-optimizers.md |  7 +++++++
 docs/apidocs/soap.md                      |  2 ++
 docs/index.md                             |  4 ++--
 emerging_optimizers/mixin.py              |  1 +
 .../orthogonalized_optimizers/__init__.py |  1 +
 7 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index f5f5532..02ab6f5 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,6 @@
 <div align="center">
-
 [![codecov](https://codecov.io/gh/NVIDIA-NeMo/Emerging-Optimizers/graph/badge.svg?token=IQ6U7IFYN0)](https://codecov.io/gh/NVIDIA-NeMo/Emerging-Optimizers)
 [![CICD NeMo](https://github.com/NVIDIA-NeMo/Emerging-Optimizers/actions/workflows/cicd-main.yml/badge.svg?branch=main)](https://github.com/NVIDIA-NeMo/Emerging-Optimizers/actions/workflows/cicd-main.yml)
@@ -14,6 +13,10 @@
 </div>
 
+| **`Documentation`** |
+| ------------------------------------------------------------ |
+| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://docs.nvidia.com/nemo/emerging-optimizers/latest/index.html) |
+
 ## Overview
 
 Emerging Optimizers is a research project focused on understanding and optimizing the algorithmic behavior of emerging optimizers (including Shampoo, SOAP, Muon, and others) and their implications for the performance of GPU systems in LLM training.
@@ -53,15 +56,13 @@ pip install .
 
 ## Usage
 
-### Muon Optimizer
-
-Muon (MomentUm Orthogonalized by Newton-schulz) uses orthogonalization for 2D parameters.
+### Example
 
-For a simple usage example, see [`tests/test_orthogonalized_optimizer.py::MuonTest`](tests/test_orthogonalized_optimizer.py).
+Refer to tests for usage of different optimizers, e.g. [`tests/test_orthogonalized_optimizer.py::MuonTest`](tests/test_orthogonalized_optimizer.py).
 
 ### Integration with Megatron Core
 
-Integration with Megatron Core is in progress. See the [integration PR](https://github.com/NVIDIA/Megatron-LM/pull/1813) that demonstrates usage with Dense and MoE models.
+Integration with Megatron Core is is available in **dev** branch, e.g. [muon.py](https://github.com/NVIDIA/Megatron-LM/blob/dev/megatron/core/optimizer/muon.py)
 
 ## Benchmarks
 
diff --git a/docs/apidocs/index.md b/docs/apidocs/index.md
index ccd7a57..8f5391e 100644
--- a/docs/apidocs/index.md
+++ b/docs/apidocs/index.md
@@ -6,10 +6,11 @@ NeMo Emerging Optimizers API reference provides comprehensive technical documentation
 :caption: API Documentation
 :hidden:
 
-utils.md
 orthogonalized-optimizers.md
 soap.md
 riemannian-optimizers.md
 psgd.md
 scalar-optimizers.md
+mixin.md
+utils.md
 ```
\ No newline at end of file
diff --git a/docs/apidocs/orthogonalized-optimizers.md b/docs/apidocs/orthogonalized-optimizers.md
index 2fb2b77..388ef48 100644
--- a/docs/apidocs/orthogonalized-optimizers.md
+++ b/docs/apidocs/orthogonalized-optimizers.md
@@ -21,6 +21,13 @@ emerging_optimizers.orthogonalized_optimizers
    :members:
 
+:hidden:`Scion`
+~~~~~~~~~~~~~~~
+
+.. autoclass:: Scion
+   :members:
+
+
 :hidden:`Newton-Schulz`
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. automodule:: emerging_optimizers.orthogonalized_optimizers.muon_utils
diff --git a/docs/apidocs/soap.md b/docs/apidocs/soap.md
index 6dcf3bc..dc107d0 100644
--- a/docs/apidocs/soap.md
+++ b/docs/apidocs/soap.md
@@ -20,6 +20,8 @@ emerging_optimizers.soap
 
 .. autofunction:: update_kronecker_factors
 
+.. autofunction:: update_kronecker_factors_kl_shampoo
+
 .. autofunction:: update_eigenbasis_and_momentum
 
 emerging_optimizers.soap.soap_utils
diff --git a/docs/index.md b/docs/index.md
index 8cc66c6..d90b53b 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -12,7 +12,7 @@ Emerging-Optimizers is under active development. All APIs are experimental and subject to change.
 
 ### Prerequisites
 
-- Python 3.12 or higher
+- Python 3.10 or higher (3.12 recommended)
 - PyTorch 2.0 or higher
 
 ### Install from Source
@@ -33,8 +33,8 @@ Coming soon.
 
 ```{toctree}
 :caption: 🛠️ Development
 :hidden:
 
-documentation.md
 apidocs/index.md
+documentation.md
 ```
diff --git a/emerging_optimizers/mixin.py b/emerging_optimizers/mixin.py
index 508166e..4ad2dc6 100644
--- a/emerging_optimizers/mixin.py
+++ b/emerging_optimizers/mixin.py
@@ -25,6 +25,7 @@ class WeightDecayMixin:
     """Mixin for weight decay
 
     Supports different types of weight decay:
+
     - "decoupled": weight decay is applied directly to params without changing gradients
     - "independent": similar to decoupled weight decay, but without tying weight decay and learning rate
     - "l2": classic L2 regularization
diff --git a/emerging_optimizers/orthogonalized_optimizers/__init__.py b/emerging_optimizers/orthogonalized_optimizers/__init__.py
index 8b8f9a4..c809ebb 100644
--- a/emerging_optimizers/orthogonalized_optimizers/__init__.py
+++ b/emerging_optimizers/orthogonalized_optimizers/__init__.py
@@ -14,4 +14,5 @@
 # limitations under the License.
 from emerging_optimizers.orthogonalized_optimizers.muon import *
 from emerging_optimizers.orthogonalized_optimizers.orthogonalized_optimizer import *
+from emerging_optimizers.orthogonalized_optimizers.scion import *
 from emerging_optimizers.orthogonalized_optimizers.spectral_clipping_utils import *

From fc748af4cfeba4c2bbbd9deebf0fd545b699730e Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Fri, 7 Nov 2025 19:39:16 -0800
Subject: [PATCH 2/6] rollback accidental change

Signed-off-by: Hao Wu
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 02ab6f5..69dfd8b 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@
 <div align="center">
+
 [![codecov](https://codecov.io/gh/NVIDIA-NeMo/Emerging-Optimizers/graph/badge.svg?token=IQ6U7IFYN0)](https://codecov.io/gh/NVIDIA-NeMo/Emerging-Optimizers)
 [![CICD NeMo](https://github.com/NVIDIA-NeMo/Emerging-Optimizers/actions/workflows/cicd-main.yml/badge.svg?branch=main)](https://github.com/NVIDIA-NeMo/Emerging-Optimizers/actions/workflows/cicd-main.yml)

From 297d807431cba0beeccf8e5a147717530155dcd9 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Fri, 7 Nov 2025 19:41:13 -0800
Subject: [PATCH 3/6] update badge for doc

Signed-off-by: Hao Wu
---
 README.md | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 69dfd8b..22b45ee 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@
 <div align="center">
 
+[![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://docs.nvidia.com/nemo/emerging-optimizers/latest/index.html)
 [![codecov](https://codecov.io/gh/NVIDIA-NeMo/Emerging-Optimizers/graph/badge.svg?token=IQ6U7IFYN0)](https://codecov.io/gh/NVIDIA-NeMo/Emerging-Optimizers)
 [![CICD NeMo](https://github.com/NVIDIA-NeMo/Emerging-Optimizers/actions/workflows/cicd-main.yml/badge.svg?branch=main)](https://github.com/NVIDIA-NeMo/Emerging-Optimizers/actions/workflows/cicd-main.yml)
@@ -14,10 +15,6 @@
 </div>
 
-| **`Documentation`** |
-| ------------------------------------------------------------ |
-| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://docs.nvidia.com/nemo/emerging-optimizers/latest/index.html) |
-
 ## Overview
 
 Emerging Optimizers is a research project focused on understanding and optimizing the algorithmic behavior of emerging optimizers (including Shampoo, SOAP, Muon, and others) and their implications for the performance of GPU systems in LLM training.

From 05d94c34f4fe0ddd47f4f52b0d3f154e199ce112 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Fri, 7 Nov 2025 19:41:59 -0800
Subject: [PATCH 4/6] shuffle badges around

Signed-off-by: Hao Wu
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 22b45ee..f52f301 100644
--- a/README.md
+++ b/README.md
@@ -6,12 +6,12 @@
 <div align="center">
 
-[![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://docs.nvidia.com/nemo/emerging-optimizers/latest/index.html)
 [![codecov](https://codecov.io/gh/NVIDIA-NeMo/Emerging-Optimizers/graph/badge.svg?token=IQ6U7IFYN0)](https://codecov.io/gh/NVIDIA-NeMo/Emerging-Optimizers)
 [![CICD NeMo](https://github.com/NVIDIA-NeMo/Emerging-Optimizers/actions/workflows/cicd-main.yml/badge.svg?branch=main)](https://github.com/NVIDIA-NeMo/Emerging-Optimizers/actions/workflows/cicd-main.yml)
 [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)
 ![GitHub Repo stars](https://img.shields.io/github/stars/NVIDIA-NeMo/Emerging-Optimizers)
+[![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://docs.nvidia.com/nemo/emerging-optimizers/latest/index.html)
 
 </div>

From d7e7d144a015fabf02795a2a8cc9741878382c66 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Fri, 7 Nov 2025 19:42:37 -0800
Subject: [PATCH 5/6] fix typo

Signed-off-by: Hao Wu
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f52f301..ccb74bb 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ Refer to tests for usage of different optimizers, e.g.
 
 ### Integration with Megatron Core
 
-Integration with Megatron Core is is available in **dev** branch, e.g. [muon.py](https://github.com/NVIDIA/Megatron-LM/blob/dev/megatron/core/optimizer/muon.py)
+Integration with Megatron Core is available in the **dev** branch, e.g. [muon.py](https://github.com/NVIDIA/Megatron-LM/blob/dev/megatron/core/optimizer/muon.py).
 
 ## Benchmarks

From 420bb64e1509c4e7c2d5cf6ec08cc721e5cce7d1 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Fri, 7 Nov 2025 19:51:47 -0800
Subject: [PATCH 6/6] add doc for mixin

Signed-off-by: Hao Wu
---
 docs/apidocs/mixin.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 docs/apidocs/mixin.md

diff --git a/docs/apidocs/mixin.md b/docs/apidocs/mixin.md
new file mode 100644
index 0000000..77af82b
--- /dev/null
+++ b/docs/apidocs/mixin.md
@@ -0,0 +1,12 @@
+
+```{eval-rst}
+.. role:: hidden
+   :class: hidden-section
+
+emerging_optimizers.mixin
+==========================
+
+.. automodule:: emerging_optimizers.mixin
+   :members:
+   :private-members:
+```
\ No newline at end of file
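
A note on the weight decay modes that the `WeightDecayMixin` docstring in patch 1 describes (and that the new `mixin.md` page now surfaces): the three strings select genuinely different update rules. Below is a minimal sketch of the distinction; `apply_weight_decay` is an illustrative helper invented here, not part of the `emerging_optimizers.mixin` API.

```python
import torch


def apply_weight_decay(param: torch.Tensor, grad: torch.Tensor, lr: float, wd: float, mode: str) -> torch.Tensor:
    """Illustrative only: sketches the three modes named in the WeightDecayMixin docstring."""
    if mode == "l2":
        # Classic L2 regularization: add wd * param to the gradient, so the
        # penalty flows through whatever transformation the optimizer applies.
        grad = grad.add(param, alpha=wd)
    elif mode == "decoupled":
        # Decoupled (AdamW-style): shrink the parameter directly, leaving the
        # gradient untouched; the decay step is tied to the learning rate.
        param.data.mul_(1.0 - lr * wd)
    elif mode == "independent":
        # Like decoupled, but the decay strength is not tied to the learning rate.
        param.data.mul_(1.0 - wd)
    else:
        raise ValueError(f"unknown weight decay mode: {mode}")
    return grad
```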
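For the README's usage pointer, a minimal training-step sketch follows. The `Muon` constructor arguments are assumptions (the patches establish only that the class is exported from `emerging_optimizers.orthogonalized_optimizers`), so treat `tests/test_orthogonalized_optimizer.py::MuonTest` as the authoritative example.

```python
import torch

from emerging_optimizers.orthogonalized_optimizers import Muon

# Muon orthogonalizes updates for 2D parameters, so use a matrix-shaped weight.
model = torch.nn.Linear(256, 256, bias=False)
opt = Muon(model.parameters(), lr=0.02)  # hypothetical hyperparameters

x = torch.randn(32, 256)
loss = model(x).pow(2).mean()  # toy objective
loss.backward()
opt.step()
opt.zero_grad()
```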