
Commit b170c49

[Autotuner] Add Fp8 cuBLASLt fallback for cublas backend.
- This is to match the current behavior in XLA; the gemm-rewriter already has many checks that rewrite to a cuBLASLt matmul.
- We are in any case trying to deprecate legacy cuBLAS and enable cuBLASLt.

PiperOrigin-RevId: 839273756
1 parent: 4ed551d

5 files changed: +125 additions, -18 deletions

- xla/backends/gpu/autotuner/BUILD
- xla/backends/gpu/autotuner/cublas.cc
- xla/backends/gpu/autotuner/cublas.h
- xla/backends/gpu/autotuner/cublas_test.cc
- xla/backends/gpu/autotuner/fission_backend_test.cc

xla/backends/gpu/autotuner/BUILD

Lines changed: 0 additions & 1 deletion
@@ -832,7 +832,6 @@ xla_test(
         ":fission_backend",
         ":gpu_codegen_backend",
         "//xla/backends/autotuner:codegen_backend",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass_pipeline",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",

xla/backends/gpu/autotuner/cublas.cc

Lines changed: 19 additions & 9 deletions
@@ -25,14 +25,9 @@ limitations under the License.
 #include "xla/autotuning.pb.h"
 #include "xla/backends/autotuner/codegen_backend.h"
 #include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/hlo/utils/hlo_query.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/cublas_cudnn.h"
 #include "xla/service/gpu/matmul_utils.h"
-#include "xla/service/gpu/transforms/dot_algorithm_rewriter.h"
-#include "xla/service/gpu/transforms/gemm_rewriter.h"
-#include "xla/service/hlo_cost_analysis.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_memory.h"
@@ -49,10 +44,19 @@ namespace se = ::stream_executor;
 
 absl::StatusOr<std::vector<std::unique_ptr<BackendConfig>>>
 CublasBackend::GetSupportedConfigs(const HloInstruction& instr) {
-  if (!IsLegacyCublasMatmul(instr)) {
+  if (!IsSupported(instr)) {
     return std::vector<std::unique_ptr<BackendConfig>>();
   }
 
+  if (ShouldUseCublasLt(instr)) {
+    std::vector<std::unique_ptr<BackendConfig>> configs;
+    AutotuneResult::GemmKey gemm_key;
+    gemm_key.set_algorithm(0);
+    configs.push_back(std::make_unique<google::protobuf::Any>());
+    configs.back()->PackFrom(gemm_key);
+    return configs;
+  }
+
   std::unique_ptr<se::DeviceMemoryAllocator> allocator =
       std::make_unique<se::StreamExecutorMemoryAllocator>(stream_executor());
   TF_ASSIGN_OR_RETURN(
@@ -126,14 +130,16 @@ CublasBackend::GetSupportedConfigs(const HloInstruction& instr) {
 
 absl::StatusOr<std::unique_ptr<BackendConfig>> CublasBackend::GetDefaultConfig(
     const HloInstruction& instr) {
-  if (!IsLegacyCublasMatmul(instr)) {
+  if (!IsSupported(instr)) {
     return absl::InvalidArgumentError(
         "CublasBackend does not support this instruction.");
   }
-
   AutotuneResult::GemmKey gemm_key;
   gemm_key.set_algorithm(se::blas::kDefaultAlgorithm);
   auto any = std::make_unique<google::protobuf::Any>();
+  if (ShouldUseCublasLt(instr)) {
+    gemm_key.set_algorithm(0);
+  }
   any->PackFrom(gemm_key);
   return any;
 }
@@ -154,7 +160,11 @@ absl::Status CublasBackend::ApplyConfig(HloInstruction& instr,
 }
 
 bool CublasBackend::IsSupported(const HloInstruction& instr) {
-  return IsLegacyCublasMatmul(instr);
+  return IsLegacyCublasMatmul(instr) || ShouldUseCublasLt(instr);
+}
+
+bool CublasBackend::ShouldUseCublasLt(const HloInstruction& instr) {
+  return fp8_lt_fallback_ && IsCublasLtMatmulF8(instr);
 }
 
 } // namespace gpu
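
Note on the hunks above: when fp8_lt_fallback_ is set and the instruction is an F8 cuBLASLt matmul custom call, GetSupportedConfigs returns exactly one candidate config, a GemmKey with algorithm 0 packed into a google::protobuf::Any. A minimal standalone sketch of that packing step follows; it assumes only protobuf and the AutotuneResult proto from xla/autotuning.pb.h, and the helper name Fp8LtFallbackConfigs is hypothetical, not part of this change.

#include <memory>
#include <vector>

#include "google/protobuf/any.pb.h"
#include "xla/autotuning.pb.h"

// Sketch only: mirrors the fallback branch added to
// CublasBackend::GetSupportedConfigs above. The real method also handles the
// legacy cuBLAS path and device-memory setup, which are elided here.
std::vector<std::unique_ptr<google::protobuf::Any>> Fp8LtFallbackConfigs() {
  std::vector<std::unique_ptr<google::protobuf::Any>> configs;
  xla::AutotuneResult::GemmKey gemm_key;
  gemm_key.set_algorithm(0);  // The fallback pins algorithm 0.
  configs.push_back(std::make_unique<google::protobuf::Any>());
  configs.back()->PackFrom(gemm_key);
  return configs;  // Exactly one candidate config for the autotuner.
}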

xla/backends/gpu/autotuner/cublas.h

Lines changed: 9 additions & 3 deletions
@@ -32,7 +32,8 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-// A codegen backend for cuBLAS.
+// A codegen backend for cuBLAS, with configurable fallback to cuBLAS LT for F8
+// matmuls.
 // This backend is used to autotune cuBLAS algorithms.
 //
 // Cublas calls are represented as custom-call instructions, with and
@@ -48,9 +49,11 @@ class CublasBackend : public GpuCodegenBackend {
  public:
   explicit CublasBackend(stream_executor::StreamExecutor* stream_executor,
                          const DebugOptions* debug_options, Compiler* compiler,
-                         const Compiler::GpuTargetConfig* target_config)
+                         const Compiler::GpuTargetConfig* target_config,
+                         bool fp8_lt_fallback = false)
       : GpuCodegenBackend("Cublas", debug_options, compiler, target_config,
-                          stream_executor) {}
+                          stream_executor),
+        fp8_lt_fallback_(fp8_lt_fallback) {}
 
   absl::StatusOr<std::vector<std::unique_ptr<BackendConfig>>>
   GetSupportedConfigs(const HloInstruction& instr) override;
@@ -62,7 +65,10 @@ class CublasBackend : public GpuCodegenBackend {
                            const BackendConfig& config) override;
 
  private:
+  bool ShouldUseCublasLt(const HloInstruction& instr);
+
   bool IsSupported(const HloInstruction& instr) override;
+  bool fp8_lt_fallback_;
 };
 
 } // namespace gpu
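
For context, a hedged usage sketch of the new constructor argument, lifted from the test added in cublas_test.cc below; the fixture members (stream_executor_, debug_options_, compiler_, target_config_) are assumed to exist as in that test:

// fp8_lt_fallback defaults to false, so existing callers are unaffected;
// passing true lets the backend also claim "__cublas$lt$matmul$f8" calls.
CublasBackend backend(stream_executor_, &debug_options_, &compiler_,
                      &target_config_, /*fp8_lt_fallback=*/true);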

xla/backends/gpu/autotuner/cublas_test.cc

Lines changed: 65 additions & 5 deletions
@@ -44,6 +44,10 @@ namespace gpu {
 
 using CublasBackendConfig = AutotuneResult::GemmKey;
 
+using absl_testing::IsOk;
+using absl_testing::IsOkAndHolds;
+using ::testing::IsEmpty;
+using ::testing::Not;
 using ::tsl::proto_testing::EqualsProto;
 
 const char kCublasCustomCallHlo[] = R"(
@@ -68,6 +72,48 @@ const char kCublasCustomCallHlo[] = R"(
   ROOT %get-tuple-element = f32[100,100]{1,0} get-tuple-element(%custom-call.1), index=0
 })";
 
+const char kCublasLtCustomCallHlo[] = R"(
+HloModule test, entry_computation_layout={(f8e4m3fn[16,32]{1,0}, f8e5m2[32,16]{1,0}, f32[], f32[])->f32[16,16]{1,0}}
+
+ENTRY %test (x: f8e4m3fn[16,32], y: f8e5m2[32,16], x_scale: f32[], y_scale: f32[]) -> f32[16,16] {
+  %x = f8e4m3fn[16,32]{1,0} parameter(0)
+  %y = f8e5m2[32,16]{1,0} parameter(1)
+  %transpose = f8e5m2[16,32]{1,0} transpose(%y), dimensions={1,0}
+  %x_scale = f32[] parameter(2)
+  %y_scale = f32[] parameter(3)
+  %cublas-gemm.1 = (f32[16,16]{1,0}, s8[33554432]{0}) custom-call(%x, %transpose, %x_scale, %y_scale),
+    custom_call_target="__cublas$lt$matmul$f8",
+    backend_config={
+      "operation_queue_id":"0",
+      "wait_on_operation_queues":[],
+      "gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["1"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{
+          "operand_precision":["DEFAULT","DEFAULT"],
+          "algorithm":"ALG_UNSET"
+        },
+        "epilogue":"DEFAULT",
+        "lhs_stride":"512",
+        "rhs_stride":"512",
+        "grad_x":false,
+        "grad_y":false,
+        "damax_output":false
+      },
+      "force_earliest_schedule":false,
+      "reification_cost":[],
+      "device_type":"DEVICE_TYPE_INVALID"
+    }
+  ROOT %get-tuple-element = f32[16,16]{1,0} get-tuple-element(%cublas-gemm.1), index=0
+})";
+
 const char kUnsupportedHlo[] = R"(
 HloModule module
 
@@ -122,8 +168,22 @@ TEST_F(CublasBackendTest, GetSupportedConfigsFromCublasCustomCall) {
   absl::StatusOr<std::vector<std::unique_ptr<BackendConfig>>> configs =
       backend_.GetSupportedConfigs(
          (*hlo_module->entry_computation()->root_instruction()->operand(0)));
-  EXPECT_THAT(configs, absl_testing::IsOk());
-  EXPECT_GT(configs.value().size(), 0);
+  EXPECT_THAT(configs, IsOkAndHolds(Not(IsEmpty())));
+}
+
+TEST_F(CublasBackendTest, CublasLtCustomCall) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(kCublasLtCustomCallHlo));
+  const HloInstruction* instr =
+      hlo_module->entry_computation()->root_instruction()->operand(0);
+  CublasBackend backend(stream_executor_, &debug_options_, &compiler_,
+                        &target_config_, /*fp8_lt_fallback=*/true);
+  absl::StatusOr<std::vector<std::unique_ptr<BackendConfig>>> configs =
+      backend.GetSupportedConfigs(*instr);
+  EXPECT_THAT(configs, IsOkAndHolds(Not(IsEmpty())));
+
+  EXPECT_THAT(backend.GetDefaultConfig(*instr), IsOk());
+  EXPECT_THAT(backend.Compile(*instr, *configs.value()[0]), IsOk());
 }
 
 TEST_F(CublasBackendTest,
@@ -133,7 +193,7 @@ TEST_F(CublasBackendTest,
   absl::StatusOr<std::vector<std::unique_ptr<BackendConfig>>> configs =
       backend_.GetSupportedConfigs(
          (*hlo_module->entry_computation()->root_instruction()));
-  EXPECT_THAT(configs, absl_testing::IsOkAndHolds(testing::SizeIs(0)));
+  EXPECT_THAT(configs, IsOkAndHolds(testing::SizeIs(0)));
 }
 
 TEST_F(CublasBackendTest, GetDefaultConfigFromCublasCustomCall) {
@@ -162,7 +222,7 @@ TEST_F(CublasBackendTest, ApplyConfig) {
                                any));
   EXPECT_THAT(RunFileCheck(hlo_module->ToString(),
                            "CHECK: \"selected_algorithm\":\"2\""),
-              absl_testing::IsOkAndHolds(true));
+              IsOkAndHolds(true));
 }
 
 TEST_F(CublasBackendTest, Compile) {
@@ -174,7 +234,7 @@ TEST_F(CublasBackendTest, Compile) {
           *(module->entry_computation()->root_instruction()->operand(0))));
   absl::StatusOr<std::unique_ptr<Executable>> executable = backend_.Compile(
       *(module->entry_computation()->root_instruction()), *config);
-  EXPECT_THAT(executable, absl_testing::IsOk());
+  EXPECT_THAT(executable, IsOk());
 }
 
 } // namespace gpu

xla/backends/gpu/autotuner/fission_backend_test.cc

Lines changed: 32 additions & 0 deletions
@@ -74,6 +74,21 @@ const char kTritonFusionHlo[] = R"(
     backend_config={"fusion_backend_config":{"kind":"__triton_gemm"}}
 })";
 
+const char kF8TritonFusionHlo[] = R"(
+HloModule o
+
+gemm_fusion {
+  p0 = f8e4m3fn[64,6144]{1,0} parameter(0)
+  p1 = f8e4m3fn[64,6144]{1,0} parameter(1)
+  ROOT %dot.0 = f32[64,64]{1,0} dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+}
+
+ENTRY main {
+  p0 = f8e4m3fn[64,6144]{1,0} parameter(0)
+  p1 = f8e4m3fn[64,6144]{1,0} parameter(1)
+  ROOT %dot.0 = f32[64,64]{1,0} fusion(p0, p1), kind=kCustom, calls=gemm_fusion, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"fusion_backend_config":{"kind":"__triton_gemm"},"force_earliest_schedule":false}
+})";
+
 const char kUnsupportedFusionHlo[] = R"(
 HloModule module
 computation {
@@ -144,6 +159,15 @@ class FissionTest : public HloHardwareIndependentTestBase,
                                           compiler, target_config);
   }
 
+  // Static helper to create a CublasBackend.
+  static std::unique_ptr<GpuCodegenBackend> CreateCublasBackendWiithF8Fallback(
+      se::StreamExecutor* stream_executor, const DebugOptions* debug_options,
+      Compiler* compiler, const Compiler::GpuTargetConfig* target_config) {
+    return std::make_unique<CublasBackend>(stream_executor, debug_options,
+                                           compiler, target_config,
+                                           /*enable_f8_fallback=*/true);
+  }
+
   // Static helper to create a CustomKernelBackend.
   static std::unique_ptr<GpuCodegenBackend> CreateCustomKernelBackend(
       se::StreamExecutor* stream_executor, const DebugOptions* debug_options,
@@ -245,6 +269,14 @@ INSTANTIATE_TEST_SUITE_P(
          {"custom_call_target=\"__cublas$gemm\"",
           "\"selected_algorithm\":\"-1\""},
          /*expected_backend_name=*/"Cublas_fission"},
+        {"TritonFusion_CublasLt_F8",
+         kF8TritonFusionHlo,
+         &FissionTest::GetCublasRewriterPipeline,
+         &FissionTest::CreateCublasBackendWiithF8Fallback,
+         /*expected_module_substrings=*/
+         {"custom_call_target=\"__cublas$lt$matmul$f8\"",
+          "\"selected_algorithm\":\"0\""},
+         /*expected_backend_name=*/"Cublas_fission"},
         {"TritonFusion_CustomKernel",
         kTritonFusionHlo,
         &FissionTest::GetCustomKernelRewriterPipeline,
