Commit 3969290

Move 2d shape check to orthogonalize() (#87)

Signed-off-by: Hao Wu <[email protected]>
1 parent d246453

File tree

1 file changed (+7, -2 lines)

emerging_optimizers/orthogonalized_optimizers/orthogonalized_optimizer.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -147,8 +147,6 @@ def step(self, closure: Callable[[], float] | None = None) -> float | None:
 
         for group in self.param_groups:
             for p in group["params"]:
-                if p.dim() != 2:
-                    raise ValueError(f"{self.__class__.__name__} only supports 2D parameters")
                 grad = p.grad
                 if grad is None:
                     continue
@@ -195,6 +193,11 @@ def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor, **kwargs: Any) -> t
         For example, a scaled_orthogonalize_fn function can get attributes from p or from kwargs to determine if
         the parameter is a fused parameter and should be split for preconditioning.
 
+        Note:
+            N-D parameters can be supported by overriding this function. For example, convolution weight can be
+            supported by reshaping to [output_channels, input_channels * kernel_height * kernel_width], i.e. treating
+            convolution as matrix multiplication with im2col.
+
         Args:
             p: The parameter tensor. It is necessary to pass param tensor in addition to momentum because a lot of
                 information is only available in the param tensor, attributes for example. Although not used in
@@ -205,6 +208,8 @@ def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor, **kwargs: Any) -> t
         Returns:
             The orthogonalized gradient tensor.
         """
+        if grad.ndim != 2:
+            raise ValueError("Only 2D parameters are supported.")
         grad = self.scaled_orthogonalize_fn(grad)
         return grad
```
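The Note added in this commit suggests that N-D parameters can be handled by overriding `orthogonalize()` with a reshape. A minimal sketch of that idea is below; it is not the repository's implementation, and `svd_orthogonalize` is a hypothetical stand-in for whatever `scaled_orthogonalize_fn` the optimizer is configured with (e.g. a Newton-Schulz iteration):

```python
import torch


def svd_orthogonalize(m: torch.Tensor) -> torch.Tensor:
    # Stand-in for scaled_orthogonalize_fn: project the 2-D gradient onto
    # the nearest (semi-)orthogonal matrix via SVD, i.e. U @ V^T.
    u, _, vh = torch.linalg.svd(m, full_matrices=False)
    return u @ vh


def orthogonalize_nd(grad: torch.Tensor) -> torch.Tensor:
    # Flatten an N-D gradient (e.g. a conv weight of shape
    # [out_channels, in_channels, kernel_height, kernel_width]) to
    # [out_channels, in_channels * kernel_height * kernel_width] -- the
    # im2col view that treats convolution as matrix multiplication --
    # orthogonalize in 2-D, then restore the original shape.
    shape = grad.shape
    grad_2d = grad.reshape(shape[0], -1)
    return svd_orthogonalize(grad_2d).reshape(shape)
```

A subclass could apply the same reshape inside its `orthogonalize()` override before delegating to `self.scaled_orthogonalize_fn`, which would bypass the 2-D check this commit moves into the base-class method.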
