12 | 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | 13 | # See the License for the specific language governing permissions and |
14 | 14 | # limitations under the License. |
| 15 | +import math |
| 16 | +from typing import Any |
| 17 | + |
15 | 18 | import torch |
16 | 19 | from absl.testing import absltest, parameterized |
17 | 20 |
18 | 21 | from emerging_optimizers.soap import soap |
| 22 | +from emerging_optimizers.soap.soap import ( |
| 23 | + _clip_update_rms_in_place, |
| 24 | + _get_precondition_frequency, |
| 25 | + _is_eigenbasis_update_step, |
| 26 | +) |
| 27 | +from emerging_optimizers.utils.precondition_schedules import LinearSchedule |
| 28 | + |
| 29 | + |
| 30 | +class SoapFunctionsTest(parameterized.TestCase): |
| 31 | + def test_init_preconditioner_multidim_tensor_shapes(self) -> None: |
| 32 | + """Tests init_preconditioner with a multi-dimensional tensor.""" |
| 33 | + grad = torch.randn(3, 4, 5) |
| 34 | + state: dict[str, Any] = {} |
| 35 | + state["GG"] = soap.init_kronecker_factors(grad, precondition_1d=False) |
| 36 | + self.assertEqual(len(state["GG"]), grad.dim()) |
| 37 | + self.assertEqual(state["GG"][0].shape, (3, 3)) |
| 38 | + self.assertEqual(state["GG"][1].shape, (4, 4)) |
| 39 | + self.assertEqual(state["GG"][2].shape, (5, 5)) |
| 40 | + |
| 41 | + @parameterized.parameters( |
| 42 | + (1,), |
| 43 | + (2,), |
| 44 | + (3,), |
| 45 | + ) |
| 46 | + def test_adam_warmup_steps(self, adam_warmup_steps: int) -> None: |
| 47 | + """Tests that adam_warmup_steps causes state["Q"] to be None until the specified steps are completed.""" |
| 48 | + |
| 49 | + param = torch.randn(5, 3, requires_grad=True, device="cuda") |
| 50 | + |
| 51 | + optimizer = soap.SOAP( |
| 52 | + [param], |
| 53 | + lr=0.001, |
| 54 | + weight_decay=0.01, |
| 55 | + adam_warmup_steps=adam_warmup_steps, |
| 56 | + precondition_frequency=1, |
| 57 | + ) |
| 58 | + |
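|  | + # During the Adam warmup phase the eigenbasis Q is expected to stay at identity for every parameter dim.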
| 59 | + dummy_Q = [torch.eye(shape, device=param.device) for shape in param.shape] |
| 60 | + for step in range(adam_warmup_steps - 1): |
| 61 | + param.grad = torch.randn_like(param) |
| 62 | + optimizer.step() |
| 63 | + state = optimizer.state[param] |
| 64 | + |
| 65 | + torch.testing.assert_close( |
| 66 | + state["Q"], dummy_Q, atol=0, rtol=0, msg=f"Q should stay identity at step {step}" |
| 67 | + ) |
| 68 | + |
| 69 | + for step in range(adam_warmup_steps - 1, adam_warmup_steps + 3): |
| 70 | + param.grad = torch.randn_like(param) |
| 71 | + optimizer.step() |
| 72 | + state = optimizer.state[param] |
| 73 | + |
| 74 | + # Verify Q is a list with one tensor per parameter dim
| 75 | + self.assertIsInstance(state["Q"], list) |
| 76 | + self.assertEqual(len(state["Q"]), param.dim()) |
| 77 | + # Verify Q has the right shape (a list with square eigenvector matrices for each dim) |
| 78 | + self.assertEqual(state["Q"][0].shape, (5, 5)) |
| 79 | + self.assertEqual(state["Q"][1].shape, (3, 3)) |
| 80 | + |
| 81 | + def test_update_kronecker_factors(self) -> None: |
| 82 | + max_dim = 8 |
| 83 | + shampoo_beta = 0.9 |
| 84 | + dim0, dim1, dim2 = 3, max_dim + 2, 5 |
| 85 | + grad = torch.randn(dim0, dim1, dim2) |
| 86 | + |
| 87 | + # Initialize factors |
| 88 | + initial_factors = soap.init_kronecker_factors(grad, precondition_1d=False) |
| 89 | + |
| 90 | + kronecker_factors = [f.clone() for f in initial_factors] |
| 91 | + |
| 92 | + soap.update_kronecker_factors( |
| 93 | + kronecker_factor_list=kronecker_factors, |
| 94 | + grad=grad, |
| 95 | + shampoo_beta=shampoo_beta, |
| 96 | + precondition_1d=False, |
| 97 | + ) |
| 98 | + |
| 99 | + self.assertEqual(len(kronecker_factors), grad.dim()) |
| 100 | + |
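|  | + # Each factor i should follow the EMA update: beta * factor + (1 - beta) * (grad contracted with itself over all dims except i).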
| 101 | + contract_dims_0 = [1, 2] |
| 102 | + outer_product_0 = torch.tensordot(grad, grad, dims=[contract_dims_0] * 2) |
| 103 | + expected_factor_0 = initial_factors[0] * shampoo_beta + outer_product_0 * (1 - shampoo_beta) |
| 104 | + torch.testing.assert_close(kronecker_factors[0], expected_factor_0, atol=1e-6, rtol=1e-6) |
| 105 | + |
| 106 | + contract_dims_2 = [0, 1] |
| 107 | + outer_product_2 = torch.tensordot(grad, grad, dims=[contract_dims_2] * 2) |
| 108 | + expected_factor_2 = initial_factors[2] * shampoo_beta + outer_product_2 * (1 - shampoo_beta) |
| 109 | + torch.testing.assert_close(kronecker_factors[2], expected_factor_2, atol=1e-6, rtol=1e-6) |
| 110 | + |
| 111 | + @parameterized.parameters( |
| 112 | + (4, 5), |
| 113 | + (3, 3), |
| 114 | + (5, 4), |
| 115 | + ) |
| 116 | + def test_tensordot_vs_matmul(self, m: int, n: int) -> None:
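|  | + """Tests that sequential tensordot projections match the explicit matmul formulation."""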
| 117 | + # Create tensors with random eigenvectors for rotation matrices QL and QR |
| 118 | + grad = torch.randn(m, n) |
| 119 | + left_matrix = torch.randn(m, m) |
| 120 | + Q_L = torch.linalg.qr(left_matrix + left_matrix.T).Q |
| 121 | + right_matrix = torch.randn(n, n) |
| 122 | + Q_R = torch.linalg.qr(right_matrix + right_matrix.T).Q |
| 123 | + |
| 124 | + # Test that project operation to eigenbasis is correct |
| 125 | + # Calculate using sequential tensordot as used by the code |
| 126 | + grad_intermediate = torch.tensordot(grad, Q_L, dims=([0], [0])) |
| 127 | + # Check that grad_intermediate has the transposed shape
| 128 | + self.assertEqual(grad_intermediate.shape, grad.transpose(0, 1).shape)
| 129 | + grad_td = torch.tensordot(grad_intermediate, Q_R, dims=([0], [0])) |
| 130 | + # Calculate using pure sequential matmul |
| 131 | + grad_pt = Q_L.transpose(0, 1).matmul(grad).matmul(Q_R) |
| 132 | + self.assertTrue(torch.allclose(grad_td, grad_pt, atol=1e-6)) |
| 133 | + |
| 134 | + # Test that project_back operation out of eigenbasis is correct |
| 135 | + # Calculate using sequential tensordot as used by the code |
| 136 | + grad_intermediate = torch.tensordot(grad, Q_L, dims=([0], [1])) |
| 137 | + # Check that grad_intermediate has the transposed shape
| 138 | + self.assertEqual(grad_intermediate.shape, grad.transpose(0, 1).shape)
| 139 | + grad_td = torch.tensordot(grad_intermediate, Q_R, dims=([0], [1])) |
| 140 | + # Calculate using pure sequential matmul |
| 141 | + grad_pt = Q_L.matmul(grad).matmul(Q_R.transpose(0, 1)) |
| 142 | + self.assertTrue(torch.allclose(grad_td, grad_pt, atol=1e-6)) |
| 143 | + |
| 144 | + @parameterized.parameters( # type: ignore[misc] |
| 145 | + {"N": 4, "M": 8}, |
| 146 | + {"N": 16, "M": 8}, |
| 147 | + {"N": 32, "M": 8}, |
| 148 | + ) |
| 149 | + def test_project_and_project_back(self, N: int, M: int) -> None: |
| 150 | + """Tests that projecting a tensor to eigenbasis of QL and QR and back |
| 151 | +
|
| 152 | + The projected tensor should approximately recover the original tensor. |
| 153 | + """ |
| 154 | + torch.manual_seed(0) |
| 155 | + # Create a random tensor to project in and out of eigenbasis |
| 156 | + grad = torch.randn(M, N) |
| 157 | + # Create two orthonormal matrices to serve as the eigenbasis list.
| 158 | + Q_L = torch.linalg.qr(torch.randn(M, M))[0] |
| 159 | + Q_R = torch.linalg.qr(torch.randn(N, N))[0] |
| 160 | + orthonormal_matrix_list = [Q_L, Q_R] |
| 161 | + |
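|  | + # dims=[[0], [0]] is assumed to project into the eigenbasis; dims=[[0], [1]] to project back out.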
| 162 | + projected = soap.precondition( |
| 163 | + grad=grad, |
| 164 | + eigenbasis_list=orthonormal_matrix_list, |
| 165 | + dims=[[0], [0]], |
| 166 | + ) |
| 167 | + recov = soap.precondition( |
| 168 | + grad=projected, |
| 169 | + eigenbasis_list=orthonormal_matrix_list, |
| 170 | + dims=[[0], [1]], |
| 171 | + ) |
| 172 | + # Check that the recovered tensor is close to the original. |
| 173 | + torch.testing.assert_close( |
| 174 | + grad, |
| 175 | + recov, |
| 176 | + atol=1e-6, |
| 177 | + rtol=1e-6, |
| 178 | + msg="Project and project_back did not recover the original tensor.", |
| 179 | + ) |
| 180 | + |
| 181 | + def test_get_precondition_frequency_fixed(self) -> None: |
| 182 | + """Test that _get_precondition_frequency works with fixed frequency (default case).""" |
| 183 | + freq = _get_precondition_frequency(10, 100) |
| 184 | + self.assertEqual(freq, 10) |
| 185 | + |
| 186 | + @parameterized.parameters( |
| 187 | + (5, 10, 20, 10, False), |
| 188 | + (15, 10, 20, 10, True), |
| 189 | + (20, 10, 15, 10, True), |
| 190 | + (21, 10, 15, 10, False), |
| 191 | + (30, 10, 15, 10, True), |
| 192 | + (31, 10, 15, 10, False), |
| 193 | + ) |
| 194 | + def test_is_eigenbasis_update_step_fixed_frequency( |
| 195 | + self, step: int, adam_warmup_steps: int, precondition_warmup: int, precondition_frequency: int, expected: bool |
| 196 | + ) -> None: |
| 197 | + """Test _is_eigenbasis_update_step with fixed frequency.""" |
| 198 | + result = _is_eigenbasis_update_step(step, adam_warmup_steps, precondition_warmup, precondition_frequency) |
| 199 | + self.assertEqual(result, expected) |
| 200 | + |
| 201 | + def test_soap_optimizer_fixed_frequency(self) -> None: |
| 202 | + """Test that SOAP optimizer can be created with fixed precondition frequency (default case).""" |
| 203 | + param = torch.randn(10, 5, requires_grad=True) |
| 204 | + optimizer = soap.SOAP([param], lr=1e-3, precondition_frequency=10) |
| 205 | + self.assertEqual(optimizer.param_groups[0]["precondition_frequency"], 10) |
| 206 | + |
| 207 | + def test_soap_optimizer_class_based_schedule(self) -> None: |
| 208 | + """Test that SOAP optimizer can be created with class-based precondition frequency schedule.""" |
| 209 | + param = torch.randn(10, 5, requires_grad=True) |
| 210 | + schedule = LinearSchedule(min_freq=2, max_freq=10, transition_steps=100) |
| 211 | + optimizer = soap.SOAP([param], lr=1e-3, precondition_frequency=schedule) |
| 212 | + self.assertEqual(optimizer.param_groups[0]["precondition_frequency"], schedule)
| 213 | + |
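|  | + # LinearSchedule is expected to interpolate linearly from min_freq to max_freq over transition_steps.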
| 214 | + self.assertEqual(schedule(0), 2) |
| 215 | + self.assertEqual(schedule(50), 6) |
| 216 | + self.assertEqual(schedule(100), 10) |
| 217 | + |
| 218 | + adam_warmup = 1 |
| 219 | + precondition_warmup = 0 |
| 220 | + |
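|  | + # Eigenbasis updates should land on the schedule-determined steps, and the step immediately after each update should not trigger one.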
| 221 | + self.assertTrue(_is_eigenbasis_update_step(10, adam_warmup, precondition_warmup, schedule)) |
| 222 | + self.assertFalse(_is_eigenbasis_update_step(11, adam_warmup, precondition_warmup, schedule)) |
| 223 | + self.assertTrue(_is_eigenbasis_update_step(60, adam_warmup, precondition_warmup, schedule)) |
| 224 | + self.assertFalse(_is_eigenbasis_update_step(61, adam_warmup, precondition_warmup, schedule)) |
| 225 | + self.assertTrue(_is_eigenbasis_update_step(120, adam_warmup, precondition_warmup, schedule)) |
| 226 | + self.assertFalse(_is_eigenbasis_update_step(121, adam_warmup, precondition_warmup, schedule)) |
| 227 | + |
| 228 | + @parameterized.parameters( |
| 229 | + (1.0,), |
| 230 | + (0.0,), |
| 231 | + (0.5,), |
| 232 | + ) |
| 233 | + def test_clip_update_rms(self, max_rms: float) -> None: |
| 234 | + """Test that _clip_update_rms works by clipping the update RMS to max_rms in place.""" |
| 235 | + # test for 3 different update tensors
| 236 | + u_s = [ |
| 237 | + torch.tensor([4.0, -1.0, 1.0, -1.0, 1.0], device="cuda"), |
| 238 | + torch.tensor([0.2, 0.2, 0.2, 0.2, 0.0], device="cuda"), |
| 239 | + torch.tensor([0.8, 0.0, 0.0, 0.0, 0.0], device="cuda"), |
| 240 | + ] |
| 241 | + for u in u_s: |
| 242 | + u_clipped = u.clone() |
| 243 | + _clip_update_rms_in_place(u_clipped, max_rms=max_rms) |
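|  | + # A max_rms of 0 is expected to disable clipping, leaving the update norm unchanged.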
| 244 | + if max_rms == 0: |
| 245 | + self.assertTrue(torch.linalg.norm(u_clipped) == torch.linalg.norm(u)) |
| 246 | + else: |
| 247 | + self.assertTrue(torch.linalg.norm(u_clipped) / math.sqrt(u.numel()) <= max_rms) |
19 | 248 |
20 | 249 |
21 | 250 | class SoapTest(parameterized.TestCase): |