
Commit 05e54a6

update docstring
Signed-off-by: Hao Wu <skyw@nvidia.com>
1 parent 4a14cf8 commit 05e54a6


2 files changed: +30 −7 lines changed


docs/apidocs/utils.md

Lines changed: 6 additions & 0 deletions
@@ -14,4 +14,10 @@ emerging_optimizers.utils.eig
 =============================
 .. automodule:: emerging_optimizers.utils.eig
    :members:
+
+
+emerging_optimizers.utils.modules
+=================================
+.. automodule:: emerging_optimizers.utils.modules
+   :members:
 ```

emerging_optimizers/utils/modules.py

Lines changed: 24 additions & 7 deletions
@@ -22,7 +22,24 @@
 
 
 class Conv1dFlatWeights(nn.Conv1d):
-    """Conv1d with weights+bias stored in a single 2D tensor"""
+    """Conv1d with weight and bias stored in a single 2D tensor.
+
+    Conv1d layers appear in some LLMs, for example in the Mamba mixer. Because the weight is not 2D, we cannot apply
+    many of the emerging optimizers originally introduced for the 2D weights of bias-free Linear layers. Since
+    convolution can be viewed as a matrix multiplication with im2col (either implicit or explicit), we can flatten
+    the weight into a single 2D tensor and then apply the emerging optimizers to it.
+
+    Bias is no longer common in most LLMs, but it is often included in this type of conv1d.
+    Since the bias is mathematically the zeroth-order term of the polynomial, we can combine weight and bias into a
+    single 2D tensor.
+
+    Arguments are the same as :class:`torch.nn.Conv1d`.
+
+    Note:
+        Similar flattening logic can be applied to N-D convolutions, but since we have no use cases for them in LLMs
+        yet, they are not supported, even though __init__() is generalized enough to handle N-D convolutions.
+    """
 
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         super().__init__(*args, **kwargs)
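The docstring above leans on two facts: a 1-D convolution is a matrix multiplication over im2col columns, and the bias is the zeroth-order term that can be absorbed into the same matrix. A minimal sketch of that equivalence (illustrative names only, not code from this repository):

```python
# Sketch: conv1d == (flat 2D weight) @ (im2col columns augmented with a ones row).
# The extra weight column holds the bias, i.e. the zeroth-order term.
import torch
import torch.nn.functional as F

C_in, C_out, K, L = 3, 4, 5, 16
x = torch.randn(1, C_in, L)
weight = torch.randn(C_out, C_in, K)
bias = torch.randn(C_out)

# Reference result from the built-in op (stride 1, no padding).
ref = F.conv1d(x, weight, bias)

# Explicit im2col: each output position becomes a column of C_in * K input values.
cols = F.unfold(x.unsqueeze(-1), kernel_size=(K, 1)).squeeze(0)  # (C_in*K, L-K+1)

# Flat 2D parameter: flattened weight plus one extra column for the bias.
flat = torch.cat([weight.view(C_out, -1), bias.unsqueeze(1)], dim=1)  # (C_out, C_in*K + 1)

# A row of ones makes the bias column act as the constant term of the matmul.
cols_aug = torch.cat([cols, torch.ones(1, cols.shape[1])], dim=0)

out = (flat @ cols_aug).unsqueeze(0)
torch.testing.assert_close(out, ref)
```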
@@ -37,8 +54,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
         flat_weight_shape[1] += 1
         flat_weight_buffer = torch.empty(flat_weight_shape, device=self.weight.device, dtype=self.weight.dtype)
         if self.bias is not None:
-            flat_weight_buffer[:, :-1].copy_(self.weight.view(self.out_channels, -1))
-            flat_weight_buffer[:, -1].copy_(self.bias)
+            flat_weight_buffer[..., :-1].copy_(self.weight.view(self.out_channels, -1))
+            flat_weight_buffer[..., -1].copy_(self.bias)
             del self.bias
             self.has_bias = True
             self.bias = "dummy"  # Trick con1d.extra_repr() to not print bias=False
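As a toy re-creation of the copy in the hunk above (assumed shapes, illustrative names), the flat buffer gets one extra column: `[..., :-1]` receives the flattened convolution weight and `[..., -1]` receives the bias:

```python
import torch

out_channels, in_channels, kernel_size = 4, 3, 5
weight = torch.randn(out_channels, in_channels, kernel_size)
bias = torch.randn(out_channels)

# One extra column so the bias can live in the same 2D tensor as the weight.
flat_weight_shape = [out_channels, in_channels * kernel_size + 1]
flat_weight_buffer = torch.empty(flat_weight_shape, dtype=weight.dtype)
flat_weight_buffer[..., :-1].copy_(weight.view(out_channels, -1))
flat_weight_buffer[..., -1].copy_(bias)

assert torch.equal(flat_weight_buffer[..., -1], bias)
```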
@@ -66,8 +83,8 @@ def from_conv1d(cls, conv1d: nn.Conv1d) -> Self:
         )
 
         if conv1d.bias is not None:
-            conv1d_flat.weight.data[:, :-1].copy_(conv1d.weight.data.view(conv1d.out_channels, -1))
-            conv1d_flat.weight.data[:, -1].copy_(conv1d.bias.data)
+            conv1d_flat.weight.data[..., :-1].copy_(conv1d.weight.data.view(conv1d.out_channels, -1))
+            conv1d_flat.weight.data[..., -1].copy_(conv1d.bias.data)
         else:
             conv1d_flat.weight.data.copy_(conv1d.weight.data.view(conv1d.out_channels, -1))
         return conv1d_flat
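A hedged usage sketch of the `from_conv1d` classmethod shown in this hunk: convert an existing `nn.Conv1d` and check that the forward pass is unchanged. The import path follows the `emerging_optimizers.utils.modules` module documented above; adjust it if the package layout differs.

```python
import torch
import torch.nn as nn

from emerging_optimizers.utils.modules import Conv1dFlatWeights  # path assumed from the docs diff

conv = nn.Conv1d(in_channels=3, out_channels=4, kernel_size=5, bias=True)
flat_conv = Conv1dFlatWeights.from_conv1d(conv)

x = torch.randn(2, 3, 16)
torch.testing.assert_close(flat_conv(x), conv(x))

# The optimizer-facing parameter is now a single 2D tensor; per __init__ above it
# should be (out_channels, in_channels * kernel_size + 1), here (4, 16).
print(flat_conv.weight.shape)
```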
@@ -78,8 +95,8 @@ def weight_shape(self) -> tuple[int, int, int]:
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         if self.has_bias:
-            weight = self.weight[:, :-1].view(self.weight_shape)
-            bias = self.weight[:, -1]
+            weight = self.weight[..., :-1].view(self.weight_shape)
+            bias = self.weight[..., -1]
         else:
             weight = self.weight.view(self.weight_shape)
             bias = None
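The indexing change in this commit, `[:, ...]` to `[..., ...]`, is behavior-preserving for the 2D flat weight; a quick check on an arbitrary 2D tensor:

```python
import torch

w = torch.randn(4, 16)
assert torch.equal(w[:, :-1], w[..., :-1])
assert torch.equal(w[:, -1], w[..., -1])
```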
