Skip to content

Commit c341f82

Browse files
jiawenliu64 authored and facebook-github-bot committed
Improve CUTLASS GMM for llama4x pretraining fprop (#4855)
Summary: Pull Request resolved: #4855. X-link: facebookresearch/FBGEMM#1868. Optimize BF16 CUTLASS GMM to bring a 1.1x - 1.3x speedup for llama4x pretraining fprop shapes. More results can be found in this [spreadsheet](https://docs.google.com/spreadsheets/d/172Nm0F9K6XJenNFoNFqC5Sp1Ll2KhLtfOJpIfkuHDzc/edit?usp=sharing) Reviewed By: jwfromm Differential Revision: D81704026 fbshipit-source-id: 9919e05f8915c6c5db4a44d580a41ee2d997c00c
1 parent 6bd58c8 commit c341f82

8 files changed

+406
-0
lines changed

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/bf16bf16bf16_grouped.cu

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,76 @@ get_kernel_via_heuristic(int arch, int G, int total_M, int N, int K) {
152152
}
153153
}
154154

155+
// Llama4.x pretraining
156+
if (N == 2560 && K == 5120) {
157+
if (total_M <= 256) {
158+
return bf16bf16bf16_grouped_128_64_128_2_2_1_9_f;
159+
} else if (total_M <= 512) {
160+
return bf16bf16bf16_grouped_128_128_128_2_1_1_9_f;
161+
} else if (total_M <= 1024) {
162+
return bf16bf16bf16_grouped_128_128_128_2_2_1_9_t;
163+
} else {
164+
return bf16bf16bf16_grouped_128_128_128_1_2_1_9_t;
165+
}
166+
} else if (N == 5120 && K == 5120) {
167+
if (total_M <= 256) {
168+
return bf16bf16bf16_grouped_128_128_128_2_1_1_9_f;
169+
} else if (total_M <= 1024) {
170+
return bf16bf16bf16_grouped_128_128_128_2_2_1_9_t;
171+
} else if (total_M <= 4096) {
172+
return bf16bf16bf16_grouped_128_128_128_1_2_1_9_t;
173+
} else {
174+
return bf16bf16bf16_grouped_128_128_128_4_4_1_9_t;
175+
}
176+
} else if (N == 3072 && K == 6144) {
177+
if (total_M <= 512) {
178+
return bf16bf16bf16_grouped_128_128_128_2_1_1_9_f;
179+
} else if (total_M <= 1024) {
180+
return bf16bf16bf16_grouped_128_128_128_2_2_1_9_t;
181+
} else if (total_M <= 2048) {
182+
return bf16bf16bf16_grouped_128_128_128_2_1_1_9_t;
183+
} else {
184+
return bf16bf16bf16_grouped_128_128_128_1_2_1_9_t;
185+
}
186+
} else if (N == 6144 && K == 6144) {
187+
if (total_M <= 512) {
188+
return bf16bf16bf16_grouped_128_128_128_4_1_1_9_f;
189+
} else if (total_M <= 1024) {
190+
return bf16bf16bf16_grouped_128_128_128_1_2_1_9_t;
191+
} else {
192+
return bf16bf16bf16_grouped_128_128_128_4_4_1_9_t;
193+
}
194+
195+
} else if (N == 5120 && K == 1280) {
196+
if (total_M <= 256) {
197+
return bf16bf16bf16_grouped_128_128_128_4_1_1_9_f;
198+
} else {
199+
return bf16bf16bf16_grouped_128_128_128_1_2_1_9_t;
200+
}
201+
} else if (N == 5120 && K == 2560) {
202+
if (total_M <= 256) {
203+
return bf16bf16bf16_grouped_128_128_128_1_2_1_9_f;
204+
} else if (total_M <= 1024) {
205+
return bf16bf16bf16_grouped_128_128_128_2_2_1_9_t;
206+
} else {
207+
return bf16bf16bf16_grouped_128_128_128_1_2_1_9_t;
208+
}
209+
} else if (N == 6144 && K == 1536) {
210+
if (total_M <= 4096) {
211+
return bf16bf16bf16_grouped_128_128_128_1_2_1_9_f;
212+
} else {
213+
return bf16bf16bf16_grouped_128_128_128_1_2_1_9_t;
214+
}
215+
} else if (N == 6144 && K == 3072) {
216+
if (total_M <= 256) {
217+
return bf16bf16bf16_grouped_128_128_128_1_2_1_9_f;
218+
} else if (total_M <= 4096) {
219+
return bf16bf16bf16_grouped_128_128_128_1_2_1_9_t;
220+
} else {
221+
return bf16bf16bf16_grouped_128_128_128_1_4_1_9_t;
222+
}
223+
}
224+
155225
// Fallback to legacy heuristic for now.
156226
if (total_M <= 16) {
157227
return bf16bf16bf16_grouped_128_16_128_1_1_1_9_f;
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include "bf16bf16bf16_grouped_common.cuh"
10+
11+
namespace fbgemm_gpu {
12+
13+
at::Tensor bf16bf16bf16_grouped_128_128_128_1_2_1_9_f(
14+
at::Tensor X, // BF16
15+
at::Tensor W, // BF16
16+
at::Tensor output,
17+
std::optional<at::Tensor> zero_start_index_M,
18+
std::optional<at::Tensor> M_sizes) {
19+
return bf16bf16bf16_grouped_impl<at::Tensor, 128, 128, 128, 1, 2, 1, false>(
20+
X, W, output, zero_start_index_M, M_sizes);
21+
}
22+
23+
at::Tensor bf16bf16bf16_grouped_128_128_128_1_2_1_9_f(
24+
at::TensorList X, // BF16
25+
at::TensorList W, // BF16
26+
at::Tensor output,
27+
std::optional<at::Tensor> zero_start_index_M,
28+
std::optional<at::Tensor> M_sizes) {
29+
return bf16bf16bf16_grouped_impl<
30+
at::TensorList,
31+
128,
32+
128,
33+
128,
34+
1,
35+
2,
36+
1,
37+
false>(X, W, output, zero_start_index_M, M_sizes);
38+
}
39+
40+
} // namespace fbgemm_gpu
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bf16bf16bf16_grouped_common.cuh"

namespace fbgemm_gpu {

// Grouped BF16 GEMM instantiation: 128x128x128 tile, 1x2x1 cluster.
// The "_9" presumably denotes the SM90 dispatch path and the trailing "_t"
// maps to the final `true` template argument -- a kernel-schedule toggle;
// confirm semantics in bf16bf16bf16_grouped_common.cuh.
//
// Stacked-tensor variant: X and W hold all groups in single tensors.
at::Tensor bf16bf16bf16_grouped_128_128_128_1_2_1_9_t(
    at::Tensor X, // BF16
    at::Tensor W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<at::Tensor, 128, 128, 128, 1, 2, 1, true>(
      X, W, output, zero_start_index_M, M_sizes);
}

// Tensor-list variant: one tensor per group; same template configuration.
at::Tensor bf16bf16bf16_grouped_128_128_128_1_2_1_9_t(
    at::TensorList X, // BF16
    at::TensorList W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<
      at::TensorList, 128, 128, 128, 1, 2, 1, true>(
      X, W, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bf16bf16bf16_grouped_common.cuh"

namespace fbgemm_gpu {

// Grouped BF16 GEMM instantiation: 128x128x128 tile, 1x4x1 cluster.
// The "_9" presumably denotes the SM90 dispatch path and the trailing "_t"
// maps to the final `true` template argument -- a kernel-schedule toggle;
// confirm semantics in bf16bf16bf16_grouped_common.cuh.
//
// Stacked-tensor variant: X and W hold all groups in single tensors.
at::Tensor bf16bf16bf16_grouped_128_128_128_1_4_1_9_t(
    at::Tensor X, // BF16
    at::Tensor W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<at::Tensor, 128, 128, 128, 1, 4, 1, true>(
      X, W, output, zero_start_index_M, M_sizes);
}

// Tensor-list variant: one tensor per group; same template configuration.
at::Tensor bf16bf16bf16_grouped_128_128_128_1_4_1_9_t(
    at::TensorList X, // BF16
    at::TensorList W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<
      at::TensorList, 128, 128, 128, 1, 4, 1, true>(
      X, W, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bf16bf16bf16_grouped_common.cuh"

namespace fbgemm_gpu {

// Grouped BF16 GEMM instantiation: 128x128x128 tile, 2x2x1 cluster.
// The "_9" presumably denotes the SM90 dispatch path and the trailing "_t"
// maps to the final `true` template argument -- a kernel-schedule toggle;
// confirm semantics in bf16bf16bf16_grouped_common.cuh.
//
// Stacked-tensor variant: X and W hold all groups in single tensors.
at::Tensor bf16bf16bf16_grouped_128_128_128_2_2_1_9_t(
    at::Tensor X, // BF16
    at::Tensor W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<at::Tensor, 128, 128, 128, 2, 2, 1, true>(
      X, W, output, zero_start_index_M, M_sizes);
}

// Tensor-list variant: one tensor per group; same template configuration.
at::Tensor bf16bf16bf16_grouped_128_128_128_2_2_1_9_t(
    at::TensorList X, // BF16
    at::TensorList W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<
      at::TensorList, 128, 128, 128, 2, 2, 1, true>(
      X, W, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bf16bf16bf16_grouped_common.cuh"

namespace fbgemm_gpu {

// Grouped BF16 GEMM instantiation: 128x128x128 tile, 4x4x1 cluster.
// The "_9" presumably denotes the SM90 dispatch path and the trailing "_t"
// maps to the final `true` template argument -- a kernel-schedule toggle;
// confirm semantics in bf16bf16bf16_grouped_common.cuh.
//
// Stacked-tensor variant: X and W hold all groups in single tensors.
at::Tensor bf16bf16bf16_grouped_128_128_128_4_4_1_9_t(
    at::Tensor X, // BF16
    at::Tensor W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<at::Tensor, 128, 128, 128, 4, 4, 1, true>(
      X, W, output, zero_start_index_M, M_sizes);
}

// Tensor-list variant: one tensor per group; same template configuration.
at::Tensor bf16bf16bf16_grouped_128_128_128_4_4_1_9_t(
    at::TensorList X, // BF16
    at::TensorList W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<
      at::TensorList, 128, 128, 128, 4, 4, 1, true>(
      X, W, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bf16bf16bf16_grouped_common.cuh"

namespace fbgemm_gpu {

// Grouped BF16 GEMM instantiation: 128x64x128 tile, 2x2x1 cluster.
// The "_9" presumably denotes the SM90 dispatch path and the trailing "_f"
// maps to the final `false` template argument -- a kernel-schedule toggle;
// confirm semantics in bf16bf16bf16_grouped_common.cuh.
//
// Stacked-tensor variant: X and W hold all groups in single tensors.
at::Tensor bf16bf16bf16_grouped_128_64_128_2_2_1_9_f(
    at::Tensor X, // BF16
    at::Tensor W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<at::Tensor, 128, 64, 128, 2, 2, 1, false>(
      X, W, output, zero_start_index_M, M_sizes);
}

// Tensor-list variant: one tensor per group; same template configuration.
at::Tensor bf16bf16bf16_grouped_128_64_128_2_2_1_9_f(
    at::TensorList X, // BF16
    at::TensorList W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<
      at::TensorList, 128, 64, 128, 2, 2, 1, false>(
      X, W, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu

0 commit comments

Comments
 (0)