Skip to content

Commit 9941e9a

Browse files
[XLA:GPU] Prevent all-reduce codegen when replica groups are empty
Generating collective code when participating devices are not specified is not possible unless topology information is available during compilation. For this reason, this change bails out of codegen when replica_groups is empty. PiperOrigin-RevId: 837190862
1 parent 4bac105 commit 9941e9a

File tree

3 files changed

+83
-43
lines changed

3 files changed

+83
-43
lines changed

xla/backends/gpu/codegen/triton/BUILD

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,6 +1054,7 @@ cc_library(
10541054
"//xla/tsl/platform:statusor",
10551055
"@com_google_absl//absl/base",
10561056
"@com_google_absl//absl/container:flat_hash_map",
1057+
"@com_google_absl//absl/log",
10571058
"@com_google_absl//absl/status",
10581059
"@com_google_absl//absl/status:statusor",
10591060
"@com_google_absl//absl/strings",
@@ -1063,7 +1064,6 @@ cc_library(
10631064
"@llvm-project//mlir:IR",
10641065
"@llvm-project//mlir:NVVMDialect",
10651066
"@llvm-project//mlir:Support",
1066-
"@llvm-project//mlir:TensorDialect",
10671067
"@triton//:TritonDialects",
10681068
],
10691069
)
@@ -1080,7 +1080,6 @@ xla_cc_test(
10801080
"//xla:status_macros",
10811081
"//xla/backends/gpu/codegen:fusion_emitter",
10821082
"//xla/backends/gpu/codegen:fusions",
1083-
"//xla/hlo/analysis:symbolic_expr",
10841083
"//xla/hlo/ir:hlo",
10851084
"//xla/hlo/testlib:hlo_hardware_independent_test_base",
10861085
"//xla/hlo/utils:hlo_query",
@@ -1095,6 +1094,7 @@ xla_cc_test(
10951094
"@com_google_absl//absl/status",
10961095
"@com_google_absl//absl/status:statusor",
10971096
"@com_google_absl//absl/strings:str_format",
1097+
"@com_google_absl//absl/strings:string_view",
10981098
"@com_google_googletest//:gtest_main",
10991099
"@llvm-project//llvm:ir_headers",
11001100
"@llvm-project//mlir:IR",

xla/backends/gpu/codegen/triton/collective_emitter.cc

Lines changed: 57 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,20 @@ limitations under the License.
1919
#include <optional>
2020
#include <type_traits>
2121
#include <utility>
22+
#include <vector>
2223

2324
#include "absl/base/casts.h"
2425
#include "absl/container/flat_hash_map.h"
26+
#include "absl/log/log.h"
2527
#include "absl/status/status.h"
2628
#include "absl/status/statusor.h"
2729
#include "absl/strings/str_cat.h"
2830
#include "llvm/Support/MathExtras.h"
2931
#include "mlir/Dialect/Arith/IR/Arith.h"
3032
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
31-
#include "mlir/Dialect/Tensor/IR/Tensor.h"
3233
#include "mlir/IR/BuiltinTypeInterfaces.h"
3334
#include "mlir/IR/BuiltinTypes.h"
35+
#include "mlir/IR/TypeUtilities.h"
3436
#include "mlir/IR/Types.h"
3537
#include "mlir/IR/Value.h"
3638
#include "mlir/Interfaces/FunctionInterfaces.h"
@@ -80,32 +82,68 @@ static constexpr auto kGlobalAddressSpace =
8082
mlir::NVVM::NVVMMemorySpace::Global);
8183

8284
// Metadata arguments for the collective emitter.
83-
// device_rank, signal-value, signal_buffers.
85+
// device_rank, signal_value, signal_buffers.
8486
static constexpr int32_t kNumCollectiveMetadataArgs = 3;
8587

86-
bool CanAllReduceBeEmitted(const HloAllReduceInstruction* all_reduce,
87-
ReductionKind reduction_kind, int64_t num_devices,
88-
int64_t num_elements, PrimitiveType element_type,
89-
AllReduceStrategy all_reduce_strategy) {
88+
struct AllReduceInfo {
89+
ReductionKind reduction_kind;
90+
int64_t num_devices;
91+
int64_t num_elements;
92+
PrimitiveType element_type;
93+
AllReduceStrategy all_reduce_strategy;
94+
};
95+
96+
// Returns the AllReduceInfo for the given all-reduce instruction if the
97+
// instruction is supported by the codegen.
98+
std::optional<AllReduceInfo> MaybeBuildAllReduceInfo(
99+
const HloAllReduceInstruction* all_reduce) {
90100
if (!all_reduce->GetModule()
91101
->config()
92102
.debug_options()
93103
.xla_gpu_unsupported_use_all_reduce_one_shot_kernel()) {
94-
return false;
104+
return std::nullopt;
105+
}
106+
if (all_reduce->device_list().replica_groups().empty()) {
107+
VLOG(1) << "Replica groups are empty for " << all_reduce->name()
108+
<< ". Codegen will not be supported.";
109+
return std::nullopt;
95110
}
111+
const int64_t num_devices = all_reduce->device_list().num_devices_per_group();
112+
const std::optional<ReductionKind> reduction_kind =
113+
MatchReductionComputation(all_reduce->called_computations().front());
114+
if (!reduction_kind.has_value()) {
115+
return std::nullopt;
116+
}
117+
const int64_t num_elements =
118+
ShapeUtil::ElementsIn(all_reduce->operand(0)->shape());
119+
const PrimitiveType element_type =
120+
all_reduce->operand(0)->shape().element_type();
121+
// NB: We do not codegen multimem kernels for now.
122+
const AllReduceStrategy all_reduce_strategy =
123+
GetAllReduceStrategy(num_elements, /*is_multimem_enabled=*/false);
96124
// TODO(b/383125489): Support variadic all-reduce.
97125
if (all_reduce->operand_count() > 1) {
98-
return false;
126+
return std::nullopt;
99127
}
100128
const int64_t byte_size =
101129
num_elements * ShapeUtil::ByteSizeOfPrimitiveType(element_type);
102130
// TODO(b/457333991): Support twoShot for codegen.
103131
if (byte_size >
104132
GetMaxSupportedAllReduceSizeBytes(AllReduceStrategy::kOneShot)) {
105-
return false;
133+
return std::nullopt;
106134
}
107-
return IsAllReduceKernelSupported(num_devices, num_elements, element_type,
108-
reduction_kind, all_reduce_strategy);
135+
if (!IsAllReduceKernelSupported(num_devices, num_elements, element_type,
136+
reduction_kind.value(),
137+
all_reduce_strategy)) {
138+
return std::nullopt;
139+
}
140+
return AllReduceInfo{
141+
/* .reduction_kind= */ reduction_kind.value(),
142+
/* .num_devices= */ num_devices,
143+
/* .num_elements= */ num_elements,
144+
/* .element_type= */ element_type,
145+
/* .all_reduce_strategy= */ all_reduce_strategy,
146+
};
109147
}
110148

111149
// The logic here is very naive and assumes a monotonic layout
@@ -114,27 +152,15 @@ absl::StatusOr<std::optional<BlockLevelFusionConfig>>
114152
GetBlockLevelFusionConfigForAllReduce(
115153
const se::DeviceDescription& device_info,
116154
const HloAllReduceInstruction* all_reduce) {
117-
const std::optional<ReductionKind> reduction_kind =
118-
MatchReductionComputation(all_reduce->called_computations().front());
119-
if (!reduction_kind.has_value()) {
120-
return absl::InternalError(
121-
"Reduction computation not found for all-reduce.");
122-
}
123-
const int64_t num_devices = all_reduce->device_list().num_devices_per_group();
124-
const int64_t num_elements =
125-
ShapeUtil::ElementsIn(all_reduce->operand(0)->shape());
126-
const PrimitiveType element_type =
127-
all_reduce->operand(0)->shape().element_type();
128-
// NB: We do not codegen multimem kernels for now.
129-
const AllReduceStrategy all_reduce_strategy =
130-
GetAllReduceStrategy(num_elements, /*is_multimem_enabled=*/false);
131-
if (!CanAllReduceBeEmitted(all_reduce, reduction_kind.value(), num_devices,
132-
num_elements, element_type, all_reduce_strategy)) {
155+
const std::optional<AllReduceInfo> all_reduce_info =
156+
MaybeBuildAllReduceInfo(all_reduce);
157+
if (!all_reduce_info.has_value()) {
133158
return std::nullopt;
134159
}
135160
const Shape& output_shape = all_reduce->shape();
136-
const LaunchDimensions launch_dims =
137-
AllReduceLaunchDimensions(num_elements, num_devices, all_reduce_strategy);
161+
const LaunchDimensions launch_dims = AllReduceLaunchDimensions(
162+
all_reduce_info->num_elements, all_reduce_info->num_devices,
163+
all_reduce_info->all_reduce_strategy);
138164
BlockLevelFusionConfig block_level_config;
139165
block_level_config.set_num_warps(launch_dims.num_threads_per_block() /
140166
WarpSize(device_info));
@@ -143,8 +169,8 @@ GetBlockLevelFusionConfigForAllReduce(
143169
Tile* output_tile = block_level_config.add_output_tiles();
144170
const int64_t rank = output_shape.dimensions().size();
145171

146-
// Tile sizes are rolled up to power of 2 because this is what the triton
147-
// expects (and consequently the tiling infra).
172+
// Tile sizes are rolled up to power of 2 because this is what triton expects
173+
// and consequently the tiling infra.
148174
for (int i = 0; i < rank - 1; ++i) {
149175
output_tile->add_sizes(llvm::PowerOf2Ceil(output_shape.dimensions(i)));
150176
}

xla/backends/gpu/codegen/triton/collective_emitter_test.cc

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ limitations under the License.
2727
#include "absl/status/status.h"
2828
#include "absl/status/statusor.h"
2929
#include "absl/strings/str_format.h"
30+
#include "absl/strings/string_view.h"
3031
#include "llvm/IR/Module.h"
3132
#include "mlir/IR/MLIRContext.h"
3233
#include "xla/backends/gpu/codegen/fusion_emitter.h"
@@ -85,8 +86,7 @@ class CollectiveBlockLevelConfigTest : public HloHardwareIndependentTestBase {
8586
: device_info_{TestGpuDeviceInfo::RTXH100SXMDeviceInfo()} {}
8687

8788
absl::StatusOr<ModuleWithFusion> BuildModuleWithFusion(
88-
const Shape& shape) const {
89-
const std::string module_str = GetModuleStr(shape);
89+
std::string module_str) const {
9090
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
9191
ParseAndReturnVerifiedModule(module_str));
9292
const HloInstruction* instr = hlo_query::GetFirstInstructionWithOpcode(
@@ -100,7 +100,8 @@ class CollectiveBlockLevelConfigTest : public HloHardwareIndependentTestBase {
100100
}
101101

102102
protected:
103-
static std::string GetModuleStr(const Shape& shape) {
103+
static std::string GetModuleStr(const Shape& shape,
104+
absl::string_view replica_groups = "{0,1}") {
104105
return absl::StrFormat(R"(
105106
HloModule test
106107
apply_op {
@@ -111,11 +112,11 @@ class CollectiveBlockLevelConfigTest : public HloHardwareIndependentTestBase {
111112
112113
ENTRY test_computation {
113114
param_0 = %1$s parameter(0)
114-
all-reduce-start = %1$s all-reduce-start(param_0), to_apply=apply_op, replica_groups={{0,1}}
115+
all-reduce-start = %1$s all-reduce-start(param_0), to_apply=apply_op, replica_groups={%2$s}
115116
ROOT all-reduce-done = %1$s all-reduce-done(all-reduce-start)
116117
}
117118
)",
118-
shape.ToString());
119+
shape.ToString(), replica_groups);
119120
}
120121

121122
const se::DeviceDescription device_info_;
@@ -124,9 +125,9 @@ class CollectiveBlockLevelConfigTest : public HloHardwareIndependentTestBase {
124125
class CollectiveEmitterTest : public CollectiveBlockLevelConfigTest {
125126
public:
126127
absl::StatusOr<std::unique_ptr<ModuleWithEmitter>> BuildModuleWithEmitter(
127-
const Shape& shape, const se::DeviceDescription& device_info) const {
128+
std::string module_str, const se::DeviceDescription& device_info) const {
128129
TF_ASSIGN_OR_RETURN(ModuleWithFusion module_with_fusion,
129-
BuildModuleWithFusion(shape));
130+
BuildModuleWithFusion(std::move(module_str)));
130131
TF_ASSIGN_OR_RETURN(
131132
bool collective_fusion_config_set,
132133
TrySetGpuBackendConfigForCollective(
@@ -174,7 +175,7 @@ class CollectiveEmitterParameterizedTest
174175
TEST_P(CollectiveEmitterParameterizedTest, AllReduceBlockLevelConfig) {
175176
const auto& param = GetParam();
176177
TF_ASSERT_OK_AND_ASSIGN(const auto module_with_fusion,
177-
BuildModuleWithFusion(param.shape));
178+
BuildModuleWithFusion(GetModuleStr(param.shape)));
178179
TF_ASSERT_OK_AND_ASSIGN(const auto block_level_config,
179180
GetCollectiveBlockLevelFusionConfig(
180181
device_info_, module_with_fusion.FusionInstr()));
@@ -207,10 +208,22 @@ INSTANTIATE_TEST_SUITE_P(
207208
return info.param.test_name;
208209
});
209210

211+
TEST_F(CollectiveEmitterTest, AllReduceBlockLevelConfigNoReplicaGroups) {
212+
TF_ASSERT_OK_AND_ASSIGN(
213+
const auto module_with_fusion,
214+
BuildModuleWithFusion(GetModuleStr(ShapeUtil::MakeShape(F32, {65536}),
215+
/* replica_groups= */ "")));
216+
TF_ASSERT_OK_AND_ASSIGN(const auto block_level_config,
217+
GetCollectiveBlockLevelFusionConfig(
218+
device_info_, module_with_fusion.FusionInstr()));
219+
EXPECT_EQ(block_level_config, std::nullopt);
220+
}
221+
210222
TEST_F(CollectiveEmitterTest, AllReduceWithTritonGetLaunchConfig) {
211223
TF_ASSERT_OK_AND_ASSIGN(
212224
std::unique_ptr<ModuleWithEmitter> result_ptr,
213-
BuildModuleWithEmitter(ShapeUtil::MakeShape(F32, {65536}), device_info_));
225+
BuildModuleWithEmitter(GetModuleStr(ShapeUtil::MakeShape(F32, {65536})),
226+
device_info_));
214227
auto& result = *result_ptr;
215228
const TritonFusion* triton_fusion = result.emitter.get();
216229
ASSERT_NE(triton_fusion, nullptr);
@@ -223,7 +236,8 @@ TEST_F(CollectiveEmitterTest, AllReduceWithTritonGetLaunchConfig) {
223236
TEST_F(CollectiveEmitterTest, AllReduceWithTritonGenerateTritonKernel) {
224237
TF_ASSERT_OK_AND_ASSIGN(
225238
std::unique_ptr<ModuleWithEmitter> result,
226-
BuildModuleWithEmitter(ShapeUtil::MakeShape(F32, {65536}), device_info_));
239+
BuildModuleWithEmitter(GetModuleStr(ShapeUtil::MakeShape(F32, {65536})),
240+
device_info_));
227241
const TritonFusion* triton_fusion = result->emitter.get();
228242
ASSERT_NE(triton_fusion, nullptr);
229243
TF_ASSERT_OK_AND_ASSIGN(

0 commit comments

Comments
 (0)