
Commit b4cb2d5

fix fp8 gemm bug

Signed-off-by: Ye Yu <[email protected]>

1 parent 760c583

File tree: 1 file changed (+3, -1 lines)


modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py

Lines changed: 3 additions & 1 deletion
@@ -175,7 +175,9 @@ def backward(ctx, grad_outputs):
         weight = weight.dequantize(scale=scale, block_sizes=ctx.block_sizes)
         grad_input = grad_outputs @ weight
         if input_tensor is not None:
-            grad_weight = grad_outputs.transpose(-2, 1) @ input_tensor
+            grad_weight = grad_outputs.reshape(-1, grad_outputs.shape[-1]).T @ input_tensor.reshape(
+                -1, input_tensor.shape[-1]
+            )
         if compute_bias_grad is not None:
             # Sum all dimensions except the last one
             grad_bias = grad_outputs.sum(dim=list(range(grad_outputs.dim() - 1)))
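
Why the reshape fixes the gradient: below is a minimal sketch (the shapes B, S, N, K are hypothetical, not from the commit) showing that flattening every leading dimension accumulates the per-position outer products into a single (out_features, in_features) weight gradient, whereas the previous transpose(-2, 1) form only behaves as a matrix transpose for 2-D activations.

```python
import torch

# Hypothetical shapes: batch B, sequence S, out_features N, in_features K.
B, S, N, K = 2, 4, 8, 16
grad_outputs = torch.randn(B, S, N)
input_tensor = torch.randn(B, S, K)

# Fixed expression from the commit: collapse all leading dims so every
# batch/sequence position contributes to one (N, K) weight gradient.
grad_weight = grad_outputs.reshape(-1, grad_outputs.shape[-1]).T @ input_tensor.reshape(
    -1, input_tensor.shape[-1]
)
print(grad_weight.shape)  # torch.Size([8, 16])

# Equivalent to summing the per-batch matmuls, which is what the gradient of a
# weight shared across the batch requires.
reference = (grad_outputs.transpose(-2, -1) @ input_tensor).sum(dim=0)
assert torch.allclose(grad_weight, reference, atol=1e-5)

# The old transpose(-2, 1) swaps dim 1 with itself on a 3-D tensor (a no-op),
# so the subsequent matmul fails its shape check for inputs like these; it only
# behaved as a transpose when grad_outputs and input_tensor were 2-D.
```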
