
Commit de1da63

cthi authored and facebook-github-bot committed
Save some binary size (#4900)
Summary:
Pull Request resolved: #4900

X-link: facebookresearch/FBGEMM#1927

We need to urgently trim down the fbgemm binary size, as new kernels being added are running into relocation issues. Long/mid term, we will need to split up gemm into per-op buck targets and update targets to only pull in what is required, as it's becoming unwieldy to include all gemm targets (quantize_ops), and this will likely continue to cause problems in fbcode usage, especially since many targets (e.g. torchAO, Sigrid predictor (MRS + Ads)) are pulling in fbgemm now. For now, let's do some housekeeping to trim down the lib size.

[FP8 Batched GEMM](https://www.internalfb.com/code/search?q=repo%3Afbcode%20torch.ops.fbgemm.f8f8bf16_rowwise_batched&leading_context=5&trailing_context=5):
- Remove `fast_accum=False`, as no one uses it.
- Only support fp32 bias; remove bf16 bias. Bias itself is unused except for the unit test.
- Significantly reduce FP8 e5m2 to only a single kernel instance. It's highly unlikely this is used, but it is hard for us to validate that confidently right now.

FP8 Int4 mixed-precision GEMM (I believe this kernel was purely exploratory and should be unused):
- Remove FP8 e5m2 completely.

Reviewed By: jiawenliu64, q10

Differential Revision: D82842915

fbshipit-source-id: a483080d319aae5b7f24492db4a1403ebf0fab85
1 parent ac8ac3e commit de1da63

14 files changed: 172 additions, 211 deletions
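Before the per-file diffs, a hedged Python sketch of the call pattern that remains supported after this change: FP8 e4m3 inputs, FP32 row-wise scales, an optional FP32 bias, and `use_fast_accum=True`. The tensor shapes, scale layouts, registering import, and positional argument order (mirroring the C++ signature in the diff below) are illustrative assumptions, not an authoritative usage reference.

```python
import torch
import fbgemm_gpu.experimental.gen_ai  # noqa: F401 -- assumed to register torch.ops.fbgemm ops

# Hypothetical sizes: B batches of an (M x K) @ (N x K)^T GEMM.
B, M, N, K = 2, 128, 256, 512

# Row-wise quantized FP8 e4m3 operands; scale and bias shapes are assumptions.
XQ = torch.randn(B, M, K, device="cuda").to(torch.float8_e4m3fn)
WQ = torch.randn(B, N, K, device="cuda").to(torch.float8_e4m3fn)
x_scale = torch.rand(B, M, device="cuda", dtype=torch.float32)
w_scale = torch.rand(B, N, device="cuda", dtype=torch.float32)
bias = torch.rand(B, N, device="cuda", dtype=torch.float32)  # FP32 only after this change

# Positional order follows the C++ signature shown in the diff:
# (XQ, WQ, x_scale, w_scale, bias, use_fast_accum, output).
out = torch.ops.fbgemm.f8f8bf16_rowwise_batched(XQ, WQ, x_scale, w_scale, bias, True)
print(out.shape, out.dtype)  # expected: torch.Size([2, 128, 256]) torch.bfloat16
```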

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_batched.cu

Lines changed: 19 additions & 10 deletions
@@ -29,7 +29,6 @@ at::Tensor dispatch_fp8_rowwise_batched_kernel(
     at::Tensor WQ, // FP8
     at::Tensor x_scale, // FP32
     at::Tensor w_scale, // FP32
-    bool use_fast_accum = true,
     std::optional<at::Tensor> bias = std::nullopt,
     std::optional<at::Tensor> output = std::nullopt) {
   const int arch = getDeviceArch();
@@ -41,6 +40,14 @@ at::Tensor dispatch_fp8_rowwise_batched_kernel(
     M = XQ.size(1);
     N = WQ.size(1);
 
+  const bool use_e5m2 = XQ.dtype() == at::kFloat8_e5m2;
+  if (use_e5m2) {
+    TORCH_CHECK(
+        arch == 9, "f8f8bf16_rowwise_batched only supports FP8 e5m2 on SM90");
+    return f8f8bf16_rowwise_batched_64_128_128_2_1_1_9_f_e5m2(
+        XQ, WQ, x_scale, w_scale, bias, output);
+  }
+
   if (arch == 10) {
     if ((M * N <= 4096 * 4096) || (N % 256 > 0 && M % 256 == 0) ||
         (M % 256 > 0 && N % 256 > 0) || M >= 1024 && N >= 1024) {
@@ -49,21 +56,21 @@ at::Tensor dispatch_fp8_rowwise_batched_kernel(
               cute::size(
                   cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>{})) {
         return f8f8bf16_rowwise_batched_64_128_128_2_1_1_10_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+            XQ, WQ, x_scale, w_scale, bias, output);
       } else {
         return f8f8bf16_rowwise_batched_128_128_128_2_1_1_10_t(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+            XQ, WQ, x_scale, w_scale, bias, output);
       }
     } else {
       if ((ceildiv(M, 64 * 2) * ceildiv(N, 128 * 1)) <=
           kNumSMsForGB200 /
               cute::size(
                   cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>{})) {
         return f8f8bf16_rowwise_batched_64_128_128_1_2_1_10_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+            XQ, WQ, x_scale, w_scale, bias, output);
       } else {
         return f8f8bf16_rowwise_batched_128_128_128_1_2_1_10_t(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+            XQ, WQ, x_scale, w_scale, bias, output);
       }
     }
   } else {
@@ -74,21 +81,21 @@ at::Tensor dispatch_fp8_rowwise_batched_kernel(
               cute::size(
                   cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>{})) {
         return f8f8bf16_rowwise_batched_64_128_128_2_1_1_9_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+            XQ, WQ, x_scale, w_scale, bias, output);
       } else {
         return f8f8bf16_rowwise_batched_128_128_128_2_1_1_9_t(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+            XQ, WQ, x_scale, w_scale, bias, output);
       }
     } else {
       if ((ceildiv(M, 64 * 2) * ceildiv(N, 128 * 1)) <=
           kNumSMsForGB200 /
               cute::size(
                   cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>{})) {
         return f8f8bf16_rowwise_batched_64_128_128_1_2_1_9_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+            XQ, WQ, x_scale, w_scale, bias, output);
       } else {
         return f8f8bf16_rowwise_batched_128_128_128_1_2_1_9_t(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+            XQ, WQ, x_scale, w_scale, bias, output);
       }
     }
   }
@@ -102,8 +109,10 @@ at::Tensor f8f8bf16_rowwise_batched(
     std::optional<at::Tensor> bias = std::nullopt,
     bool use_fast_accum = true,
     std::optional<at::Tensor> output = std::nullopt) {
+  TORCH_CHECK(
+      use_fast_accum, "f8f8bf16_rowwise_batched only supports fast_accum=True");
   return dispatch_fp8_rowwise_batched_kernel(
-      XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+      XQ, WQ, x_scale, w_scale, bias, output);
 }
 
 #else
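Two behavioral consequences of the dispatch change above, sketched under the same illustrative assumptions as the earlier snippet (shapes, scale layouts, registering import): `use_fast_accum=False` now trips the new TORCH_CHECK, and FP8 e5m2 activations route to the single remaining kernel instance, which is restricted to SM90. Which operands may be e5m2 is an assumption here; the dispatcher keys only on `XQ.dtype()`.

```python
import torch
import fbgemm_gpu.experimental.gen_ai  # noqa: F401 -- assumed to register torch.ops.fbgemm ops

B, M, N, K = 2, 128, 256, 512
x_scale = torch.rand(B, M, device="cuda", dtype=torch.float32)
w_scale = torch.rand(B, N, device="cuda", dtype=torch.float32)

# 1) fast_accum=False is gone: the new TORCH_CHECK surfaces as a RuntimeError.
XQ = torch.randn(B, M, K, device="cuda").to(torch.float8_e4m3fn)
WQ = torch.randn(B, N, K, device="cuda").to(torch.float8_e4m3fn)
try:
    torch.ops.fbgemm.f8f8bf16_rowwise_batched(XQ, WQ, x_scale, w_scale, None, False)
except RuntimeError as err:
    print(err)  # "f8f8bf16_rowwise_batched only supports fast_accum=True"

# 2) e5m2 activations (XQ) now hit a single kernel instance, and only on SM90;
#    on other architectures the dispatcher's TORCH_CHECK raises instead.
XQ_e5m2 = torch.randn(B, M, K, device="cuda").to(torch.float8_e5m2)
out = torch.ops.fbgemm.f8f8bf16_rowwise_batched(XQ_e5m2, WQ, x_scale, w_scale, None, True)
print(out.shape)  # expected: torch.Size([2, 128, 256]) on an SM90 device
```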

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_batched/f8f8bf16_rowwise_batched_128_128_128_1_2_1_10_t.cu

Lines changed: 10 additions & 3 deletions
@@ -15,12 +15,19 @@ at::Tensor f8f8bf16_rowwise_batched_128_128_128_1_2_1_10_t(
     at::Tensor WQ,
     at::Tensor x_scale,
     at::Tensor w_scale,
-    bool use_fast_accum = true,
     std::optional<at::Tensor> bias = std::nullopt,
     std::optional<at::Tensor> output = std::nullopt) {
   // Dispatch this kernel to the correct underlying implementation.
-  return f8f8bf16_rowwise_batched_wrapper<128, 128, 128, 1, 2, 1, 10, true>(
-      XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+  return f8f8bf16_rowwise_batched_wrapper<
+      128,
+      128,
+      128,
+      1,
+      2,
+      1,
+      10,
+      true,
+      cutlass::float_e4m3_t>(XQ, WQ, x_scale, w_scale, bias, output);
 }
 
 } // namespace fbgemm_gpu

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_batched/f8f8bf16_rowwise_batched_128_128_128_1_2_1_9_t.cu

Lines changed: 10 additions & 3 deletions
@@ -15,12 +15,19 @@ at::Tensor f8f8bf16_rowwise_batched_128_128_128_1_2_1_9_t(
     at::Tensor WQ,
     at::Tensor x_scale,
     at::Tensor w_scale,
-    bool use_fast_accum = true,
     std::optional<at::Tensor> bias = std::nullopt,
     std::optional<at::Tensor> output = std::nullopt) {
   // Dispatch this kernel to the correct underlying implementation.
-  return f8f8bf16_rowwise_batched_wrapper<128, 128, 128, 1, 2, 1, 9, true>(
-      XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+  return f8f8bf16_rowwise_batched_wrapper<
+      128,
+      128,
+      128,
+      1,
+      2,
+      1,
+      9,
+      true,
+      cutlass::float_e4m3_t>(XQ, WQ, x_scale, w_scale, bias, output);
 }
 
 } // namespace fbgemm_gpu

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_batched/f8f8bf16_rowwise_batched_128_128_128_2_1_1_10_t.cu

Lines changed: 10 additions & 3 deletions
@@ -15,12 +15,19 @@ at::Tensor f8f8bf16_rowwise_batched_128_128_128_2_1_1_10_t(
     at::Tensor WQ,
     at::Tensor x_scale,
     at::Tensor w_scale,
-    bool use_fast_accum = true,
     std::optional<at::Tensor> bias = std::nullopt,
     std::optional<at::Tensor> output = std::nullopt) {
   // Dispatch this kernel to the correct underlying implementation.
-  return f8f8bf16_rowwise_batched_wrapper<128, 128, 128, 2, 1, 1, 10, true>(
-      XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+  return f8f8bf16_rowwise_batched_wrapper<
+      128,
+      128,
+      128,
+      2,
+      1,
+      1,
+      10,
+      true,
+      cutlass::float_e4m3_t>(XQ, WQ, x_scale, w_scale, bias, output);
 }
 
 } // namespace fbgemm_gpu

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_batched/f8f8bf16_rowwise_batched_128_128_128_2_1_1_9_t.cu

Lines changed: 10 additions & 3 deletions
@@ -15,12 +15,19 @@ at::Tensor f8f8bf16_rowwise_batched_128_128_128_2_1_1_9_t(
     at::Tensor WQ,
     at::Tensor x_scale,
     at::Tensor w_scale,
-    bool use_fast_accum = true,
     std::optional<at::Tensor> bias = std::nullopt,
     std::optional<at::Tensor> output = std::nullopt) {
   // Dispatch this kernel to the correct underlying implementation.
-  return f8f8bf16_rowwise_batched_wrapper<128, 128, 128, 2, 1, 1, 9, true>(
-      XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+  return f8f8bf16_rowwise_batched_wrapper<
+      128,
+      128,
+      128,
+      2,
+      1,
+      1,
+      9,
+      true,
+      cutlass::float_e4m3_t>(XQ, WQ, x_scale, w_scale, bias, output);
 }
 
 } // namespace fbgemm_gpu

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_batched/f8f8bf16_rowwise_batched_64_128_128_1_2_1_10_f.cu

Lines changed: 10 additions & 3 deletions
@@ -15,12 +15,19 @@ at::Tensor f8f8bf16_rowwise_batched_64_128_128_1_2_1_10_f(
     at::Tensor WQ,
     at::Tensor x_scale,
     at::Tensor w_scale,
-    bool use_fast_accum = true,
     std::optional<at::Tensor> bias = std::nullopt,
     std::optional<at::Tensor> output = std::nullopt) {
   // Dispatch this kernel to the correct underlying implementation.
-  return f8f8bf16_rowwise_batched_wrapper<64, 128, 128, 1, 2, 1, 10, false>(
-      XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+  return f8f8bf16_rowwise_batched_wrapper<
+      64,
+      128,
+      128,
+      1,
+      2,
+      1,
+      10,
+      false,
+      cutlass::float_e4m3_t>(XQ, WQ, x_scale, w_scale, bias, output);
 }
 
 } // namespace fbgemm_gpu

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_batched/f8f8bf16_rowwise_batched_64_128_128_1_2_1_9_f.cu

Lines changed: 10 additions & 3 deletions
@@ -15,12 +15,19 @@ at::Tensor f8f8bf16_rowwise_batched_64_128_128_1_2_1_9_f(
     at::Tensor WQ,
     at::Tensor x_scale,
     at::Tensor w_scale,
-    bool use_fast_accum = true,
     std::optional<at::Tensor> bias = std::nullopt,
     std::optional<at::Tensor> output = std::nullopt) {
   // Dispatch this kernel to the correct underlying implementation.
-  return f8f8bf16_rowwise_batched_wrapper<64, 128, 128, 1, 2, 1, 9, false>(
-      XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+  return f8f8bf16_rowwise_batched_wrapper<
+      64,
+      128,
+      128,
+      1,
+      2,
+      1,
+      9,
+      false,
+      cutlass::float_e4m3_t>(XQ, WQ, x_scale, w_scale, bias, output);
 }
 
 } // namespace fbgemm_gpu

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_batched/f8f8bf16_rowwise_batched_64_128_128_2_1_1_10_f.cu

Lines changed: 10 additions & 3 deletions
@@ -15,12 +15,19 @@ at::Tensor f8f8bf16_rowwise_batched_64_128_128_2_1_1_10_f(
     at::Tensor WQ,
     at::Tensor x_scale,
     at::Tensor w_scale,
-    bool use_fast_accum = true,
     std::optional<at::Tensor> bias = std::nullopt,
     std::optional<at::Tensor> output = std::nullopt) {
   // Dispatch this kernel to the correct underlying implementation.
-  return f8f8bf16_rowwise_batched_wrapper<64, 128, 128, 2, 1, 1, 10, false>(
-      XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+  return f8f8bf16_rowwise_batched_wrapper<
+      64,
+      128,
+      128,
+      2,
+      1,
+      1,
+      10,
+      false,
+      cutlass::float_e4m3_t>(XQ, WQ, x_scale, w_scale, bias, output);
 }
 
 } // namespace fbgemm_gpu

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_batched/f8f8bf16_rowwise_batched_64_128_128_2_1_1_9_f.cu

Lines changed: 10 additions & 3 deletions
@@ -15,12 +15,19 @@ at::Tensor f8f8bf16_rowwise_batched_64_128_128_2_1_1_9_f(
     at::Tensor WQ,
     at::Tensor x_scale,
     at::Tensor w_scale,
-    bool use_fast_accum = true,
     std::optional<at::Tensor> bias = std::nullopt,
     std::optional<at::Tensor> output = std::nullopt) {
   // Dispatch this kernel to the correct underlying implementation.
-  return f8f8bf16_rowwise_batched_wrapper<64, 128, 128, 2, 1, 1, 9, false>(
-      XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+  return f8f8bf16_rowwise_batched_wrapper<
+      64,
+      128,
+      128,
+      2,
+      1,
+      1,
+      9,
+      false,
+      cutlass::float_e4m3_t>(XQ, WQ, x_scale, w_scale, bias, output);
 }
 
 } // namespace fbgemm_gpu
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include "f8f8bf16_rowwise_batched_common.cuh"
10+
11+
namespace fbgemm_gpu {
12+
13+
at::Tensor f8f8bf16_rowwise_batched_64_128_128_2_1_1_9_f_e5m2(
14+
at::Tensor XQ,
15+
at::Tensor WQ,
16+
at::Tensor x_scale,
17+
at::Tensor w_scale,
18+
std::optional<at::Tensor> bias = std::nullopt,
19+
std::optional<at::Tensor> output = std::nullopt) {
20+
// Dispatch this kernel to the correct underlying implementation.
21+
return f8f8bf16_rowwise_batched_wrapper<
22+
64,
23+
128,
24+
128,
25+
2,
26+
1,
27+
1,
28+
9,
29+
false,
30+
cutlass::float_e5m2_t>(XQ, WQ, x_scale, w_scale, bias, output);
31+
}
32+
33+
} // namespace fbgemm_gpu
