added type hints for psgd

mkhona-nvidia · mkhona-nvidia · commit 7b5456549ec0 · 2025-10-03T16:56:58.000-07:00
Signed-off-by: mikail <mkhona@nvidia.com>
diff --git a/emerging_optimizers/psgd/procrustes_step.py b/emerging_optimizers/psgd/procrustes_step.py
@@ -23,7 +23,7 @@
 ]
 
 
-def procrustes_step(Q, max_step_size=1 / 8):
+def procrustes_step(Q: torch.Tensor, max_step_size: float = 1 / 8) -> None:
     r"""One step of an in-place online solver for the orthogonal Procrustes problem.
 
     The orthogonal Procrustes problem is min_U || U Q - I ||_F,   s.t. U^H U = I
diff --git a/tests/test_procrustes_step.py b/tests/test_procrustes_step.py
@@ -11,16 +11,16 @@
 class ProcrustesStepTest(parameterized.TestCase):
     """Test cases for procrustes_step function."""
 
-    def setUp(self):
+    def setUp(self) -> None:
         """Set up test fixtures."""
         torch.manual_seed(42)
         self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 
-    def _procrustes_objective(self, Q):
+    def _procrustes_objective(self, Q: torch.Tensor) -> torch.Tensor:
         """Helper function to compute Procrustes objective ||Q^H Q - I||_F^2."""
         return torch.linalg.matrix_norm(Q.H @ Q - torch.eye(Q.size(0), dtype=Q.dtype, device=Q.device), ord="fro") ** 2
 
-    def test_improves_orthogonality_simple_case(self):
+    def test_improves_orthogonality_simple_case(self) -> None:
         """Test that procrustes_step doesn't worsen orthogonality for a simple case."""
 
         # Make a SPD non-orthogonal matrix
@@ -35,7 +35,7 @@ def test_improves_orthogonality_simple_case(self):
 
         self.assertLessEqual(final_obj.item(), initial_obj.item() + 1e-6)
 
-    def test_modifies_matrix_in_place(self):
+    def test_modifies_matrix_in_place(self) -> None:
         """Test that procrustes_step modifies the matrix in place."""
         Q = torch.randn(3, 3, device=self.device)
         Q_original_id = id(Q)
@@ -49,7 +49,7 @@ def test_modifies_matrix_in_place(self):
         (128,),
         (1024,),
     )
-    def test_minimal_change_when_already_orthogonal(self, size):
+    def test_minimal_change_when_already_orthogonal(self, size: int) -> None:
         """Test that procrustes_step makes minimal changes to an already orthogonal matrix."""
         # Create an orthogonal matrix using QR decomposition
         A = torch.randn(size, size, device=self.device, dtype=torch.float32)
@@ -66,7 +66,7 @@ def test_minimal_change_when_already_orthogonal(self, size):
         self.assertLess(final_obj.item(), 1e-5)
         self.assertLess(final_obj.item(), initial_obj.item() + 1e-5)
 
-    def test_handles_small_norm_gracefully(self):
+    def test_handles_small_norm_gracefully(self) -> None:
         """Test that procrustes_step handles matrices with small R norm improvement."""
         # Create a matrix very close to orthogonal
         A = torch.randn(3, 3, device=self.device, dtype=torch.float32)
@@ -90,7 +90,7 @@ def test_handles_small_norm_gracefully(self):
         (1 / 16,),
         (1 / 8,),
     )
-    def test_different_step_sizes(self, max_step_size):
+    def test_different_step_sizes(self, max_step_size: float) -> None:
         """Test procrustes_step improvement with different step sizes."""
         perturbation = 1e-1 * torch.randn(10, 10, device=self.device, dtype=torch.float32) / math.sqrt(10)
         Q = torch.linalg.qr(torch.randn(10, 10, device=self.device, dtype=torch.float32)).Q + perturbation
@@ -108,7 +108,7 @@ def test_different_step_sizes(self, max_step_size):
         (512,),
         (8192,),
     )
-    def test_different_matrix_sizes(self, size):
+    def test_different_matrix_sizes(self, size: int) -> None:
         """Test procrustes_step improvement with different matrix sizes."""
         # Create a non-orthogonal matrix by scaling an orthogonal one
         A = torch.randn(size, size, device=self.device, dtype=torch.float32)
@@ -124,7 +124,7 @@ def test_different_matrix_sizes(self, size):
 
         self.assertLessEqual(final_obj.item(), initial_obj.item() + 1e-3)
 
-    def test_preserves_determinant_sign_for_real_matrices(self):
+    def test_preserves_determinant_sign_for_real_matrices(self) -> None:
         """Test that procrustes_step preserves the sign of determinant for real matrices."""
         # Create real matrices with positive and negative determinants
         Q_pos = torch.tensor([[2.0, 0.1], [0.1, 1.5]], device=self.device, dtype=torch.float32)  # det > 0
diff --git a/tests/test_psgd_contractions.py b/tests/test_psgd_contractions.py
@@ -14,7 +14,7 @@
 class TestPSGDKronContractions(parameterized.TestCase):
     """Test cases for PSGD Kronecker contractions."""
 
-    def setUp(self):
+    def setUp(self) -> None:
         """Set up test fixtures."""
         torch.manual_seed(42)
         self.device = torch.device("cuda")
@@ -24,7 +24,7 @@ def setUp(self):
         (2, 3, 4),
         (2, 3, 5),
     )
-    def test_partial_contraction_matches_reconstructed(self, size1, size2, size3):
+    def test_partial_contraction_matches_reconstructed(self, size1: int, size2: int, size3: int) -> None:
         """Test partial_contraction matches reconstructed."""
         G1 = torch.randn(size1, size2, size3, device=self.device)
         G2 = torch.randn(size1, size2, size3, device=self.device)
@@ -33,7 +33,7 @@ def test_partial_contraction_matches_reconstructed(self, size1, size2, size3):
             reconstructed = torch.tensordot(G1, G2, dims=([0, 2], [0, 2]))
         torch.testing.assert_close(result, reconstructed)
 
-    def test_apply_kronecker_factors_matches_reconstructed(self):
+    def test_apply_kronecker_factors_matches_reconstructed(self) -> None:
         """Test apply_kronecker_factors matches reconstructed."""
         Q_list = [
             torch.triu(torch.randn(2, 2, device=self.device)),
@@ -62,7 +62,7 @@ def test_apply_kronecker_factors_matches_reconstructed(self):
 
         torch.testing.assert_close(result, reconstructed)
 
-    def test_apply_preconditioner_matches_reconstructed(self):
+    def test_apply_preconditioner_matches_reconstructed(self) -> None:
         """Test apply_preconditioner matches manual reconstruction for 2D tensor."""
         Q_list = [torch.triu(torch.randn(3, 3, device=self.device)), torch.triu(torch.randn(4, 4, device=self.device))]
         X = torch.randn(3, 4, device=self.device)
@@ -97,7 +97,7 @@ def test_apply_preconditioner_matches_reconstructed(self):
         (2, 3, 5, 2),
         (4, 6, 2, 1),
     )
-    def test_mode_n_mul_and_permute_shapes(self, dim0, dim1, dim2, mode):
+    def test_mode_n_mul_and_permute_shapes(self, dim0: int, dim1: int, dim2: int, mode: int) -> None:
         """Test `_mode_n_mul_and_permute` with non-uniform shapes and different modes."""
         X = torch.randn(dim0, dim1, dim2, device=self.device)
         input_shape = X.shape
diff --git a/tests/test_psgd_utils.py b/tests/test_psgd_utils.py
@@ -12,25 +12,25 @@
 class BalanceQTest(parameterized.TestCase):
     """Test cases for balance_Q function."""
 
-    def setUp(self):
+    def setUp(self) -> None:
         """Set up test fixtures."""
         self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 
-    def test_normalization_on_empty_list(self):
+    def test_normalization_on_empty_list(self) -> None:
         """Test balance_Q with empty list."""
         Q_list = []
         balance_q_in_place(Q_list)  # Should not raise any errors
         self.assertEqual(len(Q_list), 0)
 
-    def test_normalization_on_single_tensor(self):
+    def test_normalization_on_single_tensor(self) -> None:
         """Test balance_Q with single tensor."""
         Q = torch.randn(3, 3, device=self.device)
         original_Q = Q.clone()
         balance_q_in_place([Q])
         # for a single tensor, the result should be the same as the original
         torch.testing.assert_close(Q, original_Q)
 
-    def test_normalization_on_two_tensors(self):
+    def test_normalization_on_two_tensors(self) -> None:
         """Test balance_Q with two tensors."""
         Q1 = torch.tensor([[1.0, 2.0], [3.0, 4.0]], device=self.device)
         Q2 = torch.tensor([[0.1, 0.2], [0.3, 0.4]], device=self.device)
@@ -53,7 +53,7 @@ def test_normalization_on_two_tensors(self):
         (256, 256, 256),
         (4096, 4096, 4096),
     )
-    def test_normalization_on_three_tensors(self, size1, size2, size3):
+    def test_normalization_on_three_tensors(self, size1: int, size2: int, size3: int) -> None:
         """Test balance_Q with multiple tensors of different dynamic ranges."""
         Q1 = torch.randn(size1, size1, device=self.device) * 10.0
         Q2 = torch.randn(size2, size2, device=self.device) * 0.01
@@ -76,7 +76,7 @@ def test_normalization_on_three_tensors(self, size1, size2, size3):
         self.assertAlmostEqual(new_max2.item(), expected_max.item(), places=5)
         self.assertAlmostEqual(new_max3.item(), expected_max.item(), places=5)
 
-    def test_modifies_in_place_on_three_tensors(self):
+    def test_modifies_in_place_on_three_tensors(self) -> None:
         """Test that balance_Q modifies tensors in place."""
         Q = torch.randn(3, 3, device=self.device)
         original_id = id(Q)
@@ -89,12 +89,12 @@ def test_modifies_in_place_on_three_tensors(self):
 class NormLowerBoundSpdTest(parameterized.TestCase):
     """Test cases for norm_lower_bound_spd function."""
 
-    def setUp(self):
+    def setUp(self) -> None:
         """Set up test fixtures."""
         torch.manual_seed(42)  # For reproducible tests
         self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 
-    def test_diagonal_matrix(self):
+    def test_diagonal_matrix(self) -> None:
         """Test norm_lower_bound_spd with diagonal matrix."""
         # For diagonal matrix, spectral norm equals largest diagonal entry
         diag_values = torch.tensor([1.0, 3.0, 2.0], device=self.device)
@@ -108,15 +108,15 @@ def test_diagonal_matrix(self):
         # For diagonal matrix, bound should be reasonably tight
         self.assertGreater(bound.item(), 0.5 * actual_norm.item())
 
-    def test_identity_matrix(self):
+    def test_identity_matrix(self) -> None:
         """Test norm_lower_bound_spd with identity matrix."""
         A = torch.eye(3, device=self.device)
         bound = norm_lower_bound_spd(A)
 
         # For identity matrix, spectral norm is 1
         self.assertAlmostEqual(bound.item(), 1.0, places=5)
 
-    def test_zero_matrix(self):
+    def test_zero_matrix(self) -> None:
         """Test norm_lower_bound_spd with zero matrix."""
         A = torch.zeros(3, 3, device=self.device)
         bound = norm_lower_bound_spd(A)
@@ -128,7 +128,7 @@ def test_zero_matrix(self):
         dtype=[torch.float32, torch.bfloat16],
         size=[32, 256, 4096],
     )
-    def test_norm_lower_bound_spd_is_lower_bound(self, dtype, size):
+    def test_norm_lower_bound_spd_is_lower_bound(self, dtype: torch.dtype, size: int) -> None:
         """Test that norm_lower_bound_spd provides a valid lower bound."""
         # Create a random SPD matrix
         B = torch.randn(size, size, dtype=dtype, device=self.device)
@@ -150,20 +150,20 @@ def test_norm_lower_bound_spd_is_lower_bound(self, dtype, size):
 class NormLowerBoundSkewTest(parameterized.TestCase):
     """Test cases for norm_lower_bound_skew function."""
 
-    def setUp(self):
+    def setUp(self) -> None:
         """Set up test fixtures."""
         torch.manual_seed(42)  # For reproducible tests
         self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 
-    def test_zero_matrix(self):
+    def test_zero_matrix(self) -> None:
         """Test norm_lower_bound_skew with zero matrix."""
         A = torch.zeros(3, 3, device=self.device)
         bound = norm_lower_bound_skew(A)
 
         # For zero matrix, bound should be 0
         self.assertAlmostEqual(bound.item(), 0.0, places=5)
 
-    def test_small_skew_symmetric_matrix(self):
+    def test_small_skew_symmetric_matrix(self) -> None:
         """Test norm_lower_bound_skew with a simple skew-symmetric matrix."""
         # Create a simple 3x3 skew-symmetric matrix
         A = torch.tensor([[0.0, 1.0, -2.0], [-1.0, 0.0, 3.0], [2.0, -3.0, 0.0]], device=self.device)
@@ -177,7 +177,7 @@ def test_small_skew_symmetric_matrix(self):
         # Bound should be positive for non-zero matrix
         self.assertGreater(bound.item(), 0.0)
 
-    def test_identity_based_skew_matrix(self):
+    def test_identity_based_skew_matrix(self) -> None:
         """Test norm_lower_bound_skew with matrix based on identity structure."""
         # Create skew-symmetric matrix from anti-symmetric part of random matrix
         n = 4
@@ -194,7 +194,7 @@ def test_identity_based_skew_matrix(self):
         dtype=[torch.float32, torch.float64],
         size=[32, 128, 256],
     )
-    def test_norm_lower_bound_skew_is_lower_bound(self, dtype, size):
+    def test_norm_lower_bound_skew_is_lower_bound(self, dtype: torch.dtype, size: int) -> None:
         """Test that norm_lower_bound_skew provides a valid lower bound."""
         # Create a random skew-symmetric matrix
         B = torch.randn(size, size, dtype=dtype, device=self.device)
@@ -211,7 +211,7 @@ def test_norm_lower_bound_skew_is_lower_bound(self, dtype, size):
         self.assertGreaterEqual(bound.item(), 0.0)
 
     @parameterized.parameters([4, 16, 32])
-    def test_different_subspace_dimensions(self, rank):
+    def test_different_subspace_dimensions(self, rank: int) -> None:
         """Test norm_lower_bound_skew with different subspace dimensions."""
         # Create a skew-symmetric matrix
         B = torch.randn(64, 64, device=self.device)
@@ -222,7 +222,7 @@ def test_different_subspace_dimensions(self, rank):
         self.assertGreaterEqual(bound.item(), 0.0)
 
         actual_norm = torch.linalg.matrix_norm(A, ord=2)
-        self.assertLessEqual(bound.item(), actual_norm.item() + 1e-4)
+        self.assertLessEqual(bound.item(), actual_norm.item() + 1e-5)
 
 
 if __name__ == "__main__":

Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@`
`23`	`23`	`]`
`24`	`24`
`25`	`25`
`26`		`-def procrustes_step(Q, max_step_size=1 / 8):`
	`26`	`+def procrustes_step(Q: torch.Tensor, max_step_size: float = 1 / 8) -> None:`
`27`	`27`	`r"""One step of an in-place online solver for the orthogonal Procrustes problem.`
`28`	`28`
`29`	`29`	`The orthogonal Procrustes problem is min_U || U Q - I ||_F, s.t. U^H U = I`