update credits for orthogonalized optimizer and scaling in muon (#18)

mkhona-nvidia · web-flow · commit 5889b64b924d · 2025-09-24T19:40:28.000-07:00
Signed-off-by: mikail &lt;mkhona@nvidia.com&gt;
diff --git a/emerging_optimizers/orthogonalized_optimizers/muon.py b/emerging_optimizers/orthogonalized_optimizers/muon.py
@@ -119,10 +119,10 @@ def get_muon_scale_factor(
         # Suggested by Muon (https://kellerjordan.github.io/posts/muon/)
         return extra_scale_factor * max(1, size_out / size_in) ** 0.5
     elif mode == "spectral":
-        # Suggested by Scion (https://arxiv.org/abs/2502.07529) and Kimi (https://arxiv.org/abs/2502.16982)
+        # Suggested by K. Jordan and Kimi (https://arxiv.org/abs/2502.16982)
         return extra_scale_factor * max(size_out, size_in) ** 0.5
     elif mode == "unit_rms_norm":
-        # Suggested by Bernstein et al. (https://jeremybernste.in/writing/deriving-muon)
+        # Suggested by Scion (https://arxiv.org/abs/2502.07529) and Bernstein et al. (https://jeremybernste.in/writing/deriving-muon)
         return extra_scale_factor * (size_out / size_in) ** 0.5
     else:
         raise ValueError(f"Invalid mode for Muon update scale factor: {mode}")
diff --git a/emerging_optimizers/orthogonalized_optimizers/orthogonalized_optimizer.py b/emerging_optimizers/orthogonalized_optimizers/orthogonalized_optimizer.py
@@ -45,6 +45,8 @@ class OrthogonalizedOptimizer(optim.Optimizer):
 
     - Carlson, D., Cevher, V., and Carin, L. *Stochastic spectral descent for Restricted Boltzmann Machines.*
       In International Conference on Artificial Intelligence and Statistics (2015a).
+    - Carlson, D., Hsieh, Y.-P., Collins, E., Carin, L., and Cevher, V. *Stochastic Spectral Descent for Discrete Graphical Models.*
+      In IEEE Journal of Selected Topics in Signal Processing, vol. 10, no. 2, pp. 296-311 (2016).
     - Carlson, D., Collins, E., Hsieh, Y.-P., Carin, L., and Cevher, V. *Preconditioned spectral descent for deep learning.*
       In Neural Information Processing Systems (2015b).
     - Flynn, T. *The duality structure gradient descent algorithm: analysis and applications to neural networks.*