
Commit 85da4d6

Eetusjo authored and Google-ML-Automation committed
PR #35575: [ROCm] Add autotuner rocblas/hipblaslt backends
Imported from GitHub PR #35575

Adds ROCm-specific autotuner backends. Essentially copies the existing cublas/lt backends and renames them, with minor changes.

Comment from another PR as context, with the ask to separate CUDA/ROCm autotuner backends: #35280 (comment)

Copybara import of the project:

--
e5a5496 by Eetu Sjöblom <eetu.sjoblom@amd.com>:

copy cublas/cublaslt backends to create rocblas/hipblaslt ones

--
a09edaf by Eetu Sjöblom <eetu.sjoblom@amd.com>:

Pass cc to GetBlasComputationType

Merging this change closes #35575

FUTURE_COPYBARA_INTEGRATE_REVIEW=#35575 from ROCm:ci_rocm_autotuner_backends f9e8b77
PiperOrigin-RevId: 853116778
1 parent 4c478a9 commit 85da4d6
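
For orientation before the diffs: this change swaps the ROCm codegen-backend factory from the CUDA CublasBackend to the new RocblasBackend (see factory_rocm.cc below). A minimal sketch of the resulting call, using only names visible in the diffs; the construction of the arguments is assumed, not shown in this commit:

// Sketch based on the factory_rocm.cc diff below. The setup of
// stream_executor, debug_options, compiler, target_config, and
// mlir_context is assumed here, not part of this commit.
std::vector<std::unique_ptr<CodegenBackend>> backends =
    GetCodegenBackendsForROCm(stream_executor, debug_options, compiler,
                              target_config, mlir_context);
// The list now holds a TritonBackend followed by a RocblasBackend
// (previously a CublasBackend was pushed in its place).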

File tree

10 files changed: +1113 -14 lines changed

xla/backends/gpu/autotuner/BUILD

Lines changed: 150 additions & 1 deletion
@@ -630,6 +630,155 @@ cc_library(
     alwayslink = True,
 )
 
+cc_library(
+    name = "rocblas",
+    srcs = ["rocblas.cc"],
+    hdrs = ["rocblas.h"],
+    tags = [
+        "gpu",
+        "rocm-only",
+    ],
+    deps = [
+        ":gpu_codegen_backend",
+        "//xla:autotuning_proto_cc",
+        "//xla:shape_util",
+        "//xla:xla_proto_cc",
+        "//xla/backends/autotuner:codegen_backend",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/utils:hlo_query",
+        "//xla/service:compiler",
+        "//xla/service:hlo_cost_analysis",
+        "//xla/service/gpu:backend_configs_cc",
+        "//xla/service/gpu:cublas_cudnn",
+        "//xla/service/gpu:matmul_utils",
+        "//xla/service/gpu/autotuning:redzone_buffers",
+        "//xla/service/gpu/transforms:dot_algorithm_rewriter",
+        "//xla/service/gpu/transforms:gemm_rewriter",
+        "//xla/stream_executor:blas",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:semantic_version",
+        "//xla/stream_executor:stream_executor_h",
+        "//xla/stream_executor:stream_executor_memory_allocator",
+        "//xla/stream_executor/gpu:gpu_blas_lt",
+        "//xla/stream_executor/gpu:redzone_allocator",
+        "//xla/stream_executor/rocm:rocblas_plugin",
+        "//xla/tools:hlo_decomposer_lib",
+        "//xla/tsl/lib/gtl:iterator_range",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+    ],
+)
+
+cc_library(
+    name = "hipblaslt",
+    srcs = ["hipblaslt.cc"],
+    hdrs = ["hipblaslt.h"],
+    tags = [
+        "gpu",
+        "rocm-only",
+    ],
+    deps = [
+        ":gpu_codegen_backend",
+        "//xla:autotuning_proto_cc",
+        "//xla:shape_util",
+        "//xla:util",
+        "//xla:xla_proto_cc",
+        "//xla/backends/autotuner:codegen_backend",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:compiler",
+        "//xla/service/gpu:backend_configs_cc",
+        "//xla/service/gpu:cublas_cudnn",
+        "//xla/service/gpu:matmul_utils",
+        "//xla/stream_executor:blas",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:stream",
+        "//xla/stream_executor:stream_executor_h",
+        "//xla/stream_executor/gpu:gpu_blas_lt",
+        "//xla/stream_executor/rocm:amdhipblaslt_plugin",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+    ],
+)
+
+xla_test(
+    name = "hipblaslt_test",
+    srcs = ["hipblaslt_test.cc"],
+    backends = [
+        "amdgpu_any",
+    ],
+    tags = [
+        "gpu",
+        "rocm-only",
+    ],
+    deps = [
+        ":hipblaslt",
+        "//xla:autotuning_proto_cc",
+        "//xla:xla_proto_cc",
+        "//xla/backends/autotuner:codegen_backend",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/testlib:filecheck",
+        "//xla/hlo/testlib:hlo_hardware_independent_test_base",
+        "//xla/service:compiler",
+        "//xla/service:executable",
+        "//xla/service:platform_util",
+        "//xla/service/gpu:amdgpu_compiler_impl",
+        "//xla/stream_executor:blas",
+        "//xla/stream_executor:device_description_proto_cc",
+        "//xla/stream_executor:stream_executor_h",
+        "//xla/tsl/lib/core:status_test_util",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:status_matchers",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+xla_test(
+    name = "rocblas_test",
+    srcs = ["rocblas_test.cc"],
+    backends = [
+        "amdgpu_any",
+    ],
+    tags = [
+        "gpu",
+        "rocm-only",
+    ],
+    deps = [
+        ":rocblas",
+        "//xla:autotuning_proto_cc",
+        "//xla:xla_proto_cc",
+        "//xla/backends/autotuner:codegen_backend",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/testlib:filecheck",
+        "//xla/hlo/testlib:hlo_hardware_independent_test_base",
+        "//xla/service:compiler",
+        "//xla/service:executable",
+        "//xla/service:platform_util",
+        "//xla/service/gpu:amdgpu_compiler_impl",
+        "//xla/stream_executor:blas",
+        "//xla/stream_executor:device_description_proto_cc",
+        "//xla/stream_executor:stream_executor_h",
+        "//xla/tsl/lib/core:status_test_util",
+        "//xla/tsl/platform:statusor",
+        "//xla/tsl/util/proto:proto_matchers",
+        "@com_google_absl//absl/status:status_matchers",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "factory_rocm",
     srcs = ["factory_rocm.cc"],
@@ -638,8 +787,8 @@ cc_library(
         "rocm-only",
     ],
     deps = [
-        ":cublas",
         ":factory",
+        ":rocblas",
         ":triton",
         "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",

xla/backends/gpu/autotuner/factory_rocm.cc

Lines changed: 5 additions & 5 deletions
@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_BACKENDS_GPU_AUTOTUNER_CUDA_FACTORY_H_
-#define TENSORFLOW_COMPILER_XLA_BACKENDS_GPU_AUTOTUNER_CUDA_FACTORY_H_
+#ifndef TENSORFLOW_COMPILER_XLA_BACKENDS_GPU_AUTOTUNER_ROCM_FACTORY_H_
+#define TENSORFLOW_COMPILER_XLA_BACKENDS_GPU_AUTOTUNER_ROCM_FACTORY_H_
 
 #include <memory>
 #include <vector>
 
 #include "mlir/IR/MLIRContext.h"
 #include "xla/backends/autotuner/codegen_backend.h"
-#include "xla/backends/gpu/autotuner/cublas.h"
 #include "xla/backends/gpu/autotuner/factory.h"
+#include "xla/backends/gpu/autotuner/rocblas.h"
 #include "xla/backends/gpu/autotuner/triton.h"
 #include "xla/service/compiler.h"
 #include "xla/stream_executor/platform/platform_object_registry.h"
@@ -42,7 +42,7 @@ std::vector<std::unique_ptr<CodegenBackend>> GetCodegenBackendsForROCm(
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::make_unique<TritonBackend>(
       debug_options, compiler, target_config, mlir_context));
-  backends.push_back(std::make_unique<CublasBackend>(
+  backends.push_back(std::make_unique<RocblasBackend>(
       stream_executor, debug_options, compiler, target_config));
   return backends;
 }
@@ -66,4 +66,4 @@ STREAM_EXECUTOR_REGISTER_OBJECT_STATICALLY(GetFissionBackendsROCmRegistration,
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_BACKENDS_GPU_AUTOTUNER_CUDA_FACTORY_H_
+#endif  // TENSORFLOW_COMPILER_XLA_BACKENDS_GPU_AUTOTUNER_ROCM_FACTORY_H_
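
The backends registered above implement the CodegenBackend interface (GetSupportedConfigs, GetDefaultConfig, ApplyConfig; see the new file below). A hedged sketch of how an autotuner driver could exercise that interface; the loop and the best_config selection are illustrative, only the method names come from this PR:

// Illustrative driver loop; benchmarking and selection are elided.
for (const std::unique_ptr<CodegenBackend>& backend : backends) {
  TF_ASSIGN_OR_RETURN(auto configs, backend->GetSupportedConfigs(*instr));
  if (configs.empty()) continue;  // Instruction not supported by this backend.
  // ... compile and time each config, choose best_config, then:
  TF_RETURN_IF_ERROR(backend->ApplyConfig(*instr, *best_config));
}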
xla/backends/gpu/autotuner/hipblaslt.cc

Lines changed: 172 additions & 0 deletions
@@ -0,0 +1,172 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/autotuner/hipblaslt.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "xla/autotuning.pb.h"
+#include "xla/backends/autotuner/codegen_backend.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/service/compiler.h"
+#include "xla/service/gpu/backend_configs.pb.h"
+#include "xla/service/gpu/cublas_cudnn.h"
+#include "xla/service/gpu/matmul_utils.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+#include "xla/stream_executor/blas.h"
+#include "xla/stream_executor/device_description.h"
+#include "xla/stream_executor/gpu/gpu_blas_lt.h"
+#include "xla/stream_executor/stream.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
+#include "xla/util.h"
+
+namespace xla {
+namespace gpu {
+
+namespace se = ::stream_executor;
+using se::gpu::BlasLt;
+
+using HipblasLtBackendConfig = AutotuneResult::GemmKey;
+
+namespace {
+
+absl::StatusOr<BlasLt::Epilogue> AsBlasLtEpilogue(
+    GemmBackendConfig_Epilogue epilogue) {
+  switch (epilogue) {
+    case GemmBackendConfig::DEFAULT:
+      return BlasLt::Epilogue::kDefault;
+    case GemmBackendConfig::RELU:
+      return BlasLt::Epilogue::kReLU;
+    case GemmBackendConfig::GELU:
+      return BlasLt::Epilogue::kGELU;
+    case GemmBackendConfig::GELU_AUX:
+      return BlasLt::Epilogue::kGELUWithAux;
+    case GemmBackendConfig::SILU:
+      return BlasLt::Epilogue::kSILU;
+    case GemmBackendConfig::BIAS:
+      return BlasLt::Epilogue::kBias;
+    case GemmBackendConfig::BIAS_RELU:
+      return BlasLt::Epilogue::kBiasThenReLU;
+    case GemmBackendConfig::BIAS_GELU:
+      return BlasLt::Epilogue::kBiasThenGELU;
+    case GemmBackendConfig::BIAS_GELU_AUX:
+      return BlasLt::Epilogue::kBiasThenGELUWithAux;
+    case GemmBackendConfig::BIAS_SILU:
+      return BlasLt::Epilogue::kBiasThenSILU;
+    default:
+      return Internal("Unsupported Epilogue.");
+  }
+}
+
+}  // namespace
+
+bool HipblasLtBackend::IsSupported(const HloInstruction& instr) {
+  return IsCublasLtMatmul(instr) || IsCublasLtMatmulF8(instr);
+}
+
+absl::StatusOr<std::vector<std::unique_ptr<BackendConfig>>>
+HipblasLtBackend::GetSupportedConfigs(const HloInstruction& instr) {
+  if (!IsSupported(instr)) {
+    return std::vector<std::unique_ptr<BackendConfig>>();
+  }
+
+  GpuBackendConfig gpu_config =
+      instr.backend_config<GpuBackendConfig>().value();
+  const GemmBackendConfig& backend_config = gpu_config.gemm_backend_config();
+
+  TF_ASSIGN_OR_RETURN(
+      GemmConfig gemm_config,
+      GemmConfig::For(
+          &instr, target_config().device_description.gpu_compute_capability()));
+
+  TF_ASSIGN_OR_RETURN(BlasLt::Epilogue epilogue,
+                      AsBlasLtEpilogue(backend_config.epilogue()));
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<se::Stream> stream,
+                      stream_executor()->CreateStream());
+
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<BlasLt::MatmulPlan> plan,
+      se::gpu::BlasLt::GetMatmulPlan(stream.get(), gemm_config, epilogue));
+
+  const Shape& output_shape = instr.shape();
+  if (!output_shape.IsTuple() || output_shape.tuple_shapes().empty()) {
+    return Internal(
+        "Invalid shape for HipblasLt matmul: output is not a non-empty tuple.");
+  }
+  // The last element of the output tuple is the workspace.
+  const int64_t workspace_size =
+      ShapeUtil::ByteSizeOf(output_shape.tuple_shapes().back());
+
+  TF_ASSIGN_OR_RETURN(
+      std::vector<BlasLt::MatmulAlgorithm> algorithms,
+      plan->GetAlgorithms(stream.get(), GemmConfig::kNumAlgorithms,
+                          workspace_size));
+  int num_algorithms = algorithms.size();
+  std::vector<std::unique_ptr<BackendConfig>> configs;
+  configs.reserve(num_algorithms);
+  for (int i = 0; i < num_algorithms; ++i) {
+    HipblasLtBackendConfig gemm_key;
+    gemm_key.set_algorithm(i);
+    gemm_key.set_autotune_workspace_size(workspace_size);
+    auto any = std::make_unique<google::protobuf::Any>();
+    any->PackFrom(gemm_key);
+    configs.push_back(std::move(any));
+  }
+
+  return configs;
+}
+
+absl::StatusOr<std::unique_ptr<BackendConfig>>
+HipblasLtBackend::GetDefaultConfig(const HloInstruction& instr) {
+  if (!IsSupported(instr)) {
+    return absl::InvalidArgumentError(
+        "Not a HipblasLt custom call instruction.");
+  }
+
+  AutotuneResult::GemmKey gemm_key;
+  gemm_key.set_algorithm(0);
+  auto any = std::make_unique<google::protobuf::Any>();
+  any->PackFrom(gemm_key);
+  return any;
+}
+
+absl::Status HipblasLtBackend::ApplyConfig(HloInstruction& instr,
+                                           const BackendConfig& config) {
+  HipblasLtBackendConfig gemm_key;
+  if (!config.UnpackTo(&gemm_key)) {
+    return absl::InvalidArgumentError(
+        "Failed to unpack HipblasLtBackendConfig from Any.");
+  }
+  TF_ASSIGN_OR_RETURN(GpuBackendConfig gpu_config,
+                      instr.backend_config<GpuBackendConfig>());
+  GemmBackendConfig& backend_config = *gpu_config.mutable_gemm_backend_config();
+  backend_config.set_selected_algorithm(gemm_key.algorithm());
+  backend_config.set_autotune_workspace_size(
+      gemm_key.autotune_workspace_size());
+  TF_RETURN_IF_ERROR(instr.set_backend_config(std::move(gpu_config)));
+  return absl::OkStatus();
+}
+
+}  // namespace gpu
+}  // namespace xla
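
Configs cross the CodegenBackend boundary as google::protobuf::Any messages wrapping AutotuneResult::GemmKey, as GetSupportedConfigs and ApplyConfig above show. A self-contained sketch of that round trip (pure protobuf, no GPU required; the helper names are hypothetical, the field names are taken from the diff):

#include <cstdint>
#include <memory>

#include "google/protobuf/any.pb.h"
#include "xla/autotuning.pb.h"

// Pack, mirroring HipblasLtBackend::GetSupportedConfigs above.
std::unique_ptr<google::protobuf::Any> MakeGemmConfig(
    int64_t algorithm, int64_t workspace_size) {
  xla::AutotuneResult::GemmKey key;
  key.set_algorithm(algorithm);
  key.set_autotune_workspace_size(workspace_size);
  auto any = std::make_unique<google::protobuf::Any>();
  any->PackFrom(key);
  return any;
}

// Unpack, mirroring HipblasLtBackend::ApplyConfig above.
bool ReadGemmConfig(const google::protobuf::Any& config,
                    xla::AutotuneResult::GemmKey* key) {
  // UnpackTo fails when the Any carries a different message type.
  return config.UnpackTo(key);
}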
