Skip to content

Commit d62449c

Browse files
nvgrw authored and Google-ML-Automation committed
Tag PjRt migration candidates explicitly. (BUILD changes)
This change adds a new tag "pjrt_migration_candidate" to all test targets that depend on HloTestBase, ClientLibraryTestBase, and HloRunnerTpuSystem. This change also adds a new `use_legacy_runtime` kwarg to `xla_test`, which acts as a replacement for "test_migrated_to_hlo_runner_pjrt".

During a brief transition phase, we will leave all "test_migrated_to_hlo_runner_pjrt" tags in place so that we can identify any tests that have `use_legacy_runtime` set to an incorrect value.

"pjrt_migration_candidate" and "test_migrated_to_hlo_runner_pjrt" are mutually exclusive: "pjrt_migration_candidate" should not appear on any tests using the new runtime. Unlike "test_migrated_to_hlo_runner_pjrt", which primarily tags `xla_test` targets, "pjrt_migration_candidate" is intended to tag all outstanding migration candidates so that we obtain an accurate picture of migration progress. If a test cannot or should not be migrated, it can be excluded from any analysis simply by removing the tag.

PiperOrigin-RevId: 852941492
1 parent 4492860 commit d62449c

File tree

8 files changed

+56
-28
lines changed

8 files changed

+56
-28
lines changed

xla/backends/cpu/transforms/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ cc_library(
118118
xla_cc_test(
119119
name = "ynn_matcher_test",
120120
srcs = ["ynn_matcher_test.cc"],
121+
tags = ["pjrt_migration_candidate"],
121122
deps = [
122123
"//xla:xla_proto_cc",
123124
"//xla/service:cpu_plugin",

xla/backends/gpu/runtime/BUILD

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,7 @@ cc_library(
896896
deps = [
897897
":thunk",
898898
":thunk_proto_cc",
899+
"//xla:shape_util",
899900
"//xla/runtime:buffer_use",
900901
"//xla/service:buffer_assignment",
901902
"//xla/service/gpu:buffer_allocations",
@@ -955,6 +956,8 @@ xla_test(
955956
name = "gpublas_lt_matmul_thunk_test",
956957
srcs = ["gpublas_lt_matmul_thunk_test.cc"],
957958
backends = ["gpu"],
959+
tags = ["pjrt_migration_candidate"],
960+
use_legacy_runtime = True,
958961
deps = [
959962
":gpublas_lt_matmul_thunk",
960963
":thunk",
@@ -1634,6 +1637,8 @@ xla_test(
16341637
name = "collective_broadcast_thunk_test",
16351638
srcs = ["collective_broadcast_thunk_test.cc"],
16361639
backends = ["gpu"],
1640+
tags = ["pjrt_migration_candidate"],
1641+
use_legacy_runtime = True,
16371642
deps = [
16381643
":collective_broadcast_thunk",
16391644
":collective_thunk",
@@ -1718,6 +1723,8 @@ xla_test(
17181723
name = "collective_permute_thunk_test",
17191724
srcs = ["collective_permute_thunk_test.cc"],
17201725
backends = ["gpu"],
1726+
tags = ["pjrt_migration_candidate"],
1727+
use_legacy_runtime = True,
17211728
deps = [
17221729
":collective_permute_thunk",
17231730
":collective_thunk",
@@ -3594,6 +3601,8 @@ xla_test(
35943601
name = "runtime_intrinsics_test",
35953602
srcs = ["runtime_intrinsics_test.cc"],
35963603
backends = ["gpu"],
3604+
tags = ["pjrt_migration_candidate"],
3605+
use_legacy_runtime = True,
35973606
deps = [
35983607
":runtime_intrinsics",
35993608
"//xla:literal",

xla/backends/gpu/runtime/command_buffer_cmd.cc

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1550,13 +1550,16 @@ absl::Status GemmCmd::Record(const Thunk::ExecuteParams& execute_params,
15501550
}
15511551

15521552
CommandBufferCmd::BufferUseVector GemmCmd::buffers() const {
1553+
CommandBufferCmd::BufferUseVector res{
1554+
BufferUse::Read(lhs_buffer_, config_.lhs_layout.ToShape()),
1555+
BufferUse::Read(rhs_buffer_, config_.rhs_layout.ToShape()),
1556+
BufferUse::Write(output_buffer_, config_.output_layout.ToShape()),
1557+
};
15531558
if (workspace_.has_value()) {
1554-
return {BufferUse::Read(lhs_buffer_), BufferUse::Read(rhs_buffer_),
1555-
BufferUse::Write(output_buffer_),
1556-
BufferUse::Write(workspace_.value())};
1559+
res.push_back(BufferUse::Write(
1560+
*workspace_, ShapeUtil::MakeShape(S8, {workspace_->size()})));
15571561
}
1558-
return {BufferUse::Read(lhs_buffer_), BufferUse::Read(rhs_buffer_),
1559-
BufferUse::Write(output_buffer_)};
1562+
return res;
15601563
}
15611564

15621565
//===----------------------------------------------------------------------===//

xla/backends/gpu/runtime/gemm_thunk.cc

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,11 @@ limitations under the License.
2525
#include "absl/types/span.h"
2626
#include "xla/backends/gpu/runtime/thunk.h"
2727
#include "xla/backends/gpu/runtime/thunk.pb.h"
28+
#include "xla/runtime/buffer_use.h"
2829
#include "xla/service/buffer_assignment.h"
2930
#include "xla/service/gpu/buffer_allocations.h"
3031
#include "xla/service/gpu/matmul_utils.h"
32+
#include "xla/shape_util.h"
3133
#include "xla/stream_executor/device_address.h"
3234
#include "xla/stream_executor/gpu/gpu_blas_lt.h"
3335
#include "xla/stream_executor/stream.h"
@@ -81,6 +83,21 @@ absl::Status GemmThunk::Initialize(const InitializeParams& params) {
8183
return absl::OkStatus();
8284
}
8385

86+
Thunk::BufferUses GemmThunk::buffer_uses() const {
87+
BufferUses res{
88+
BufferUse::Read(lhs_buffer_, config_.lhs_layout.ToShape()),
89+
BufferUse::Read(rhs_buffer_, config_.rhs_layout.ToShape()),
90+
BufferUse::Write(output_buffer_, config_.output_layout.ToShape()),
91+
};
92+
93+
if (workspace_.has_value()) {
94+
res.push_back(BufferUse::Write(
95+
*workspace_, ShapeUtil::MakeShape(S8, {workspace_->size()})));
96+
}
97+
98+
return res;
99+
}
100+
84101
absl::StatusOr<ThunkProto> GemmThunk::ToProto() const {
85102
ThunkProto proto;
86103
*proto.mutable_thunk_info() = thunk_info().ToProto();

xla/backends/gpu/runtime/gemm_thunk.h

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ limitations under the License.
2424
#include "absl/types/span.h"
2525
#include "xla/backends/gpu/runtime/thunk.h"
2626
#include "xla/backends/gpu/runtime/thunk.pb.h"
27-
#include "xla/runtime/buffer_use.h"
2827
#include "xla/service/buffer_assignment.h"
2928
#include "xla/service/gpu/matmul_utils.h"
3029

@@ -49,7 +48,7 @@ class GemmThunk : public Thunk {
4948
absl::Status ExecuteOnStream(const ExecuteParams& params) override;
5049
absl::Status Initialize(const InitializeParams& params) override;
5150

52-
GemmConfig config() const { return config_; }
51+
const GemmConfig& config() const { return config_; }
5352
BufferAllocation::Slice lhs_buffer() const { return lhs_buffer_; }
5453
BufferAllocation::Slice rhs_buffer() const { return rhs_buffer_; }
5554
BufferAllocation::Slice output_buffer() const { return output_buffer_; }
@@ -58,13 +57,7 @@ class GemmThunk : public Thunk {
5857
}
5958
bool deterministic() const { return deterministic_; }
6059

61-
BufferUses buffer_uses() const override {
62-
return {
63-
BufferUse::Read(lhs_buffer_),
64-
BufferUse::Read(rhs_buffer_),
65-
BufferUse::Write(output_buffer_),
66-
};
67-
}
60+
BufferUses buffer_uses() const override;
6861

6962
static absl::StatusOr<std::unique_ptr<GemmThunk>> FromProto(
7063
ThunkInfo thunk_info, const GemmThunkProto& proto,

xla/service/gpu/transforms/convert_triton_gemm_config.h

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,21 +29,12 @@ limitations under the License.
2929

3030
namespace xla::gpu {
3131

32-
// Rewrites supported Triton GEMM fusions to generic Triton fusions.
32+
// Annotates instructions inside the triton_gemm fusions with the tiling
33+
// parameters from its backend config.
3334
//
34-
// Fusions with kind kCustom and fusion_backend_config.kind "__triton_gemm" are
35-
// rewritten to fusion_backend_config.kind
36-
// "__triton_nested_fusion_gemm".
37-
//
38-
// While this new fusion kind is supported by generic triton emitter we want
39-
// to distinguish it from "__triton" as we don't want other passes to modify the
40-
// resulting fusions.
41-
//
42-
// The fusion's backend config is set to a BlockLevelFusionConfig, derived from
43-
// a previously set TritonGemmConfig.
44-
//
45-
// The operands of the dot (including their prologues) are fused into two new
46-
// nested fusions, each with their own BlockLevelFusionConfig.
35+
// Replaces the fusion kind with "__triton_nested_gemm_fusion" and sets the
36+
// fusion's backend config a BlockLevelFusionConfig, derived from
37+
// TritonGemmConfig.
4738
class ConvertTritonGemmConfig : public HloModulePass {
4839
public:
4940
explicit ConvertTritonGemmConfig(

xla/stream_executor/gpu/gpu_blas_lt.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ limitations under the License.
2929
#include "absl/synchronization/mutex.h"
3030
#include "xla/primitive_util.h"
3131
#include "xla/service/algorithm_util.h"
32+
#include "xla/shape.h"
33+
#include "xla/shape_util.h"
3234
#include "xla/stream_executor/blas.h"
3335
#include "xla/stream_executor/device_description.h"
3436
#include "xla/stream_executor/gpu/gpu_blas_lt.pb.h"
@@ -202,6 +204,15 @@ xla::GemmConfigProto::MatrixLayout MatrixLayout::ToProto() const {
202204
return proto;
203205
}
204206

207+
xla::Shape MatrixLayout::ToShape() const {
208+
switch (order) {
209+
case Order::kRowMajor:
210+
return xla::ShapeUtil::MakeShape(dtype, {num_cols, num_rows, batch_size});
211+
case Order::kColumnMajor:
212+
return xla::ShapeUtil::MakeShape(dtype, {num_rows, num_cols, batch_size});
213+
}
214+
}
215+
205216
absl::StatusOr<ComputationType> GetBlasComputationType(
206217
xla::PrecisionConfig::Algorithm algorithm, xla::PrimitiveType lhs_dtype,
207218
xla::PrimitiveType output_dtype, int64_t compute_precision,

xla/stream_executor/gpu/gpu_blas_lt.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ limitations under the License.
3030
#include "absl/status/status.h"
3131
#include "absl/status/statusor.h"
3232
#include "absl/synchronization/mutex.h"
33+
#include "xla/shape.h"
3334
#include "xla/stream_executor/blas.h"
3435
#include "xla/stream_executor/device_address.h"
3536
#include "xla/stream_executor/device_description.h"
@@ -83,6 +84,8 @@ struct MatrixLayout { // plain MatrixLayout which is extended with create
8384
static absl::StatusOr<MatrixLayout> FromProto(
8485
const xla::GemmConfigProto::MatrixLayout& proto);
8586
xla::GemmConfigProto::MatrixLayout ToProto() const;
87+
88+
xla::Shape ToShape() const;
8689
};
8790

8891
// compact version of the matrix layout to be used to pass matrices

0 commit comments

Comments
 (0)