Broadcast-based allgather in host for-loop (#5925)

Priya2698 · web-flow · commit 54d48aebf982 · 2026-03-09T11:33:58.000-07:00
<img width="1680" height="250" alt="Screenshot 2026-02-09 at 1 24 11 PM" src="https://github.com/user-attachments/assets/f439517d-3533-4d05-b15f-6c02fea731bf" /> The broadcast version is very slow, so I am not comparing timings until we integrate this with multicast.
diff --git a/csrc/host_ir/lower_to_communication.cpp b/csrc/host_ir/lower_to_communication.cpp
@@ -137,35 +137,43 @@ void lowerToAllgather(
       backend));
 }
 
-// Adds one or zero Broadcast communication to the vector 'comms'
+// Exactly one of the following cases applies:
+// 1. Same mesh: a broadcast-based allgather in a host for-loop. `root` is the
+//    for-loop index.
+// 2. Different meshes: we pick the first device in the sender mesh as root.
 void lowerToBroadcast(
     TensorView* input_tv,
     TensorView* output_tv,
     const CommunicatorBackend backend,
+    Val* root,
     std::vector<Expr*>& comms) {
-  // Either of the following two cases is happening.
-  // 1. `sender_mesh` contains only one device. In this case, we broadcast
-  // from that device.
-  // 2. `sender_mesh` contains multiple devices but the input is not sharded.
-  // In this case, we arbitrarily choose the first device of the sender mesh
-  // to be the root.
   const DeviceMesh& sender_mesh = input_tv->getDeviceMesh();
   const DeviceMesh& receiver_mesh = output_tv->getDeviceMesh();
 
-  NVF_ERROR_EQ(sender_mesh.rank(), 1, "sender: ", input_tv);
-  NVF_ERROR_EQ(receiver_mesh.rank(), 1, "receiver: ", output_tv);
-
-  DeviceIdxType root = sender_mesh.at(0);
   Team team = receiver_mesh.vector();
-  if (!receiver_mesh.has(root)) {
-    team.push_back(root);
+
+  if (sender_mesh == receiver_mesh) {
+    NVF_ERROR(
+        root != nullptr,
+        "Root must be provided for broadcast-based allgather in a host for "
+        "loop.");
+  } else {
+    NVF_ERROR_EQ(sender_mesh.rank(), 1, "sender: ", input_tv);
+    NVF_ERROR_EQ(receiver_mesh.rank(), 1, "receiver: ", output_tv);
+    DeviceIdxType root_device = sender_mesh.at(0);
+    if (!receiver_mesh.has(root_device)) {
+      team.push_back(root_device);
+    }
+    root = IrBuilder::create<Val>(
+        getRelativeIndex(team, root_device), DataType::Index);
   }
+
   comms.push_back(IrBuilder::create<Communication>(
       CommunicationType::Broadcast,
       output_tv,
       input_tv,
       team,
-      getRelativeIndex(team, root),
+      root,
       c10d::ReduceOp::RedOpType::UNUSED,
       backend));
 }
@@ -356,11 +364,15 @@ std::optional<CommunicationInfo> getCommunicationInfoForParallelType(
       pairwise_map.mapConsumerToProducer();
 
   IterDomain* p_loop_id = getShardedIterDomain(producer, pt, DomainType::kLoop);
-  IterDomain* c_loop_id = getShardedIterDomain(consumer, pt, DomainType::kLoop);
   IterDomain* p_logical_id =
       p_loop_id ? getLogicalFromLoopId(producer, p_loop_id) : nullptr;
+  IterDomain* c_loop_id = getShardedIterDomain(consumer, pt, DomainType::kLoop);
   IterDomain* c_logical_id =
       c_loop_id ? getLogicalFromLoopId(consumer, c_loop_id) : nullptr;
+  IterDomain* c_stream_id =
+      getShardedIterDomain(consumer, ParallelType::Stream, DomainType::kLoop);
+  IterDomain* c_logical_stream_id =
+      c_stream_id ? getLogicalFromLoopId(consumer, c_stream_id) : nullptr;
 
   const DeviceMesh& producer_mesh = producer->getDeviceMesh();
   const DeviceMesh& consumer_mesh = consumer->getDeviceMesh();
@@ -381,6 +393,19 @@ std::optional<CommunicationInfo> getCommunicationInfoForParallelType(
     }
 
     if (p_loop_id && !c_loop_id) {
+      // Check if we are going from DID -> Stream, which is a ring allgather.
+      // This can be executed as broadcasts or as send/recvs; the choice is
+      // decided by the presence of a swizzle in the stream id definition.
+      if (c_logical_stream_id == p2c.at(p_logical_id)) {
+        NVF_CHECK(
+            same_mesh,
+            "Broadcast based allgather in stream parallel requires same "
+            "mesh.")
+        return CommunicationInfo{
+            .type = CommunicationType::Broadcast,
+            .p_sharded_id = p_logical_id,
+            .c_sharded_id = c_logical_stream_id};
+      }
       CommunicationType type =
           same_mesh ? CommunicationType::Allgather : CommunicationType::Gather;
       return CommunicationInfo{
@@ -563,6 +588,7 @@ bool isCommunicationLayoutCompliant(Expr* e) {
 std::vector<Expr*> convertSingleOpToCommunication(
     Expr* e,
     DeviceIdxType my_device_idx,
+    Val* root,
     const CommunicatorBackend backend) {
   FusionGuard fg(e->fusion());
 
@@ -617,7 +643,7 @@ std::vector<Expr*> convertSingleOpToCommunication(
       lowerToAllgather(input_tv, output_tv, backend, comms, my_device_idx);
       break;
     case CommunicationType::Broadcast:
-      lowerToBroadcast(input_tv, output_tv, backend, comms);
+      lowerToBroadcast(input_tv, output_tv, backend, root, comms);
       break;
     case CommunicationType::SendRecv:
       lowerToSendRecv(input_tv, output_tv, backend, comms);
diff --git a/csrc/host_ir/lower_to_communication.h b/csrc/host_ir/lower_to_communication.h
@@ -55,9 +55,15 @@ Layout getCommunicationLayout(
     const CommunicationType type,
     IterDomain* sharded_id);
 
+// Creates the communication exprs corresponding to the given
+// resharding expr. In most cases, `root` is inferred from the
+// communication type. However, in some cases, e.g. when
+// decomposing an allgather into broadcasts in a host for-loop,
+// `root` may be passed in through lowering.
 std::vector<Expr*> convertSingleOpToCommunication(
     Expr* c,
     DeviceIdxType my_device_idx,
+    Val* root = nullptr,
     const CommunicatorBackend backend = CommunicatorBackend::kNccl);
 
 } // namespace nvfuser
diff --git a/csrc/host_ir/lowering.cpp b/csrc/host_ir/lowering.cpp
@@ -192,10 +192,16 @@ void lowerSegment(
               out,
               DomainType::kLoop,
               {ParallelType::Stream})) {
-        Val*& sharded_in = replacement_map[in];
-        if (sharded_in == nullptr) {
-          sharded_in = hir::shardByStream(in, innermost.loop->index(), e);
-          innermost_scope.pushBack(sharded_in->definition());
+        if (!replacement_map.contains(in)) {
+          TensorView* sharded_in =
+              hir::shardByStream(in, innermost.loop->index(), e);
+          if (sharded_in != nullptr) {
+            // `sharded_in` is nullptr if the input cannot be sharded by
+            // stream such as in broadcast or collective-permute based
+            // decomposition of allgather.
+            replacement_map[in] = sharded_in;
+            innermost_scope.pushBack(sharded_in->definition());
+          }
         }
       }
 
@@ -207,15 +213,23 @@ void lowerSegment(
               out, ParallelType::Stream, DomainType::kAllocation) == nullptr) {
         innermost.parent_scope->insert(
             innermost.parent_insertion_point, allocate);
-        auto [i, inserted] = replacement_map.emplace(
-            out, hir::shardByStream(out, innermost.loop->index(), e));
-        NVF_ERROR(inserted, "The input segmented fusion should be SSA.");
-        innermost_scope.pushBack(i->second->definition());
+        NVF_ERROR(
+            !replacement_map.contains(out),
+            "The input segmented fusion should be SSA.");
+        TensorView* sharded_out =
+            hir::shardByStream(out, innermost.loop->index(), e);
+        NVF_ERROR(
+            sharded_out != nullptr,
+            "Output could not be sharded by stream: ",
+            out);
+        replacement_map[out] = sharded_out;
+        innermost_scope.pushBack(sharded_out->definition());
       } else {
         innermost_scope.pushBack(allocate);
       }
 
-      for (Expr* c : convertSingleOpToCommunication(e, device_id)) {
+      Val* root = loop_nest.empty() ? nullptr : innermost.loop->index();
+      for (Expr* c : convertSingleOpToCommunication(e, device_id, root)) {
         NVF_ERROR(
             c->isA<Communication>(),
             "Exprs in a Communication group should be Communication: ",
@@ -298,6 +312,10 @@ void lowerSegment(
                   {ParallelType::Stream})) {
             TensorView* sharded_in =
                 hir::shardByStream(in, innermost.loop->index(), e);
+            NVF_ERROR(
+                sharded_in != nullptr,
+                "Input could not be sharded by stream: ",
+                in);
             replacement_map[in] = sharded_in;
             innermost_scope.pushBack(sharded_in->definition());
           }
@@ -318,6 +336,10 @@ void lowerSegment(
             // `out` should be allocated outside the loop.
             TensorView* sharded_out =
                 hir::shardByStream(out, innermost.loop->index(), e);
+            NVF_ERROR(
+                sharded_out != nullptr,
+                "Output could not be sharded by stream: ",
+                out);
             replacement_map[out] = sharded_out;
             innermost_scope.pushBack(sharded_out->definition());
           }
diff --git a/csrc/host_ir/ops.cpp b/csrc/host_ir/ops.cpp
@@ -35,13 +35,13 @@ TensorView* shardByStream(TensorView* source, Val* stream_index, Expr* e) {
       ops::newValLike(source, source->getDataType())->as<TensorView>();
 
   if (std::ranges::find(e->inputs(), source) != e->inputs().end()) {
-    // Propagate the allocation domain from `source` to `destination`.
-    // Consider adding a config to TransformReplay::selfReplay to control what
-    // to propagate, so we don't have to reset the loop domain.
+    // Propagate the domain from `source` to `destination`.
+    // Unparallelize the destination on `ParallelType::Stream` which
+    // will be inferred based on the output of the expression.
     TransformReplay::selfReplay(source->domain(), destination->domain());
-    destination->setLoopDomain(destination->getLogicalDomain());
+    unparallelize(destination, {ParallelType::Stream});
 
-    // Propagate the loop domain from `e` to `destination`. There are two
+    // Propagate ParallelType::Stream from `e` to `destination`. There are two
     // technical challenges:
     // 1. Loop domains are associated with TensorViews, not Exprs. So we
     // find e's reference output, `ref_out`, and propagate its loop domain.
@@ -58,7 +58,7 @@ TensorView* shardByStream(TensorView* source, Val* stream_index, Expr* e) {
     shardLoopLike(
         ref_out,
         destination,
-        deviceAndStreamParallelTypes(),
+        {ParallelType::Stream},
         PropagateDirection::kBackward);
     temp_e->fusion()->removeExpr(temp_e);
     // Fusion::removeExpr sets all outputs' definitions to nullptr, so we need
@@ -68,6 +68,14 @@ TensorView* shardByStream(TensorView* source, Val* stream_index, Expr* e) {
     for (auto* out : e->outputs()) {
       out->setDefinition(e);
     }
+
+    // Destination's loop domain may not be stream-parallelized if the
+    // corresponding id is already sharded such as in
+    // broadcast/collective-permute based decomposition of allgather.
+    if (getShardedIterDomain(
+            destination, ParallelType::Stream, DomainType::kLoop) == nullptr) {
+      return nullptr;
+    }
   } else {
     NVF_ERROR(
         std::ranges::find(e->outputs(), source) != e->outputs().end(),
@@ -89,8 +97,10 @@ TensorView* shardByStream(TensorView* source, Val* stream_index, Expr* e) {
           destination, ParallelType::Stream, DomainType::kAllocation) !=
           nullptr,
       "Destination allocation should be sharded on stream after "
-      "shardAllocationAsLoop: ",
-      destination);
+      "shardAllocationAsLoop. ",
+      destination->name(),
+      ":",
+      destination->domain()->toString(0, /*loop_only=*/false));
 
   // Refine the contiguity flags so `out` aliases `in`. This is done similar
   // to AliasFinder::handle(const SliceOp*). We scan through the allocation
diff --git a/csrc/host_ir/ops.h b/csrc/host_ir/ops.h
@@ -21,16 +21,20 @@
 namespace nvfuser::hir {
 
 // Creates a ShardByStream without needing the destination TensorView. Returns
-// the destination TensorView. `e` is the Expr from which we propagate the loop
-// domain from. `source` must be either an input or an output of `e`. The
-// destination TensorView will have a loop domain that's consistent with `e` and
-// an allocation domain that's a shard of `source`.
+// the destination TensorView. `e` is the Expr from which we propagate the
+// `ParallelType::Stream` domain. `source` must be either an input or an
+// output of `e`. The destination TensorView will have a `ParallelType::Stream`
+// domain that's consistent with `e` and an allocation domain that's a shard of
+// `source`.
 //
 // Why is `e` unnecessary? I made a mistake previously to propagate `source`'s
 // loop domain to `destination`. This broke
 // test_stream.py::test_two_matmuls_not_inlinable because, when `source` is an
 // input of `e`, `source`'s loop domain reflects its producing Expr rather than
 // `e`.
+// If `destination` cannot be sharded by `ParallelType::Stream`, returns
+// nullptr. For example, in a decomposed allgather we go from DIDx -> Stream,
+// and `destination` is already sharded on `DIDx`.
 TensorView* shardByStream(TensorView* source, Val* stream_index, Expr* e);
 
 } // namespace nvfuser::hir
diff --git a/csrc/host_ir/pass/convert_op_to_communication.cpp b/csrc/host_ir/pass/convert_op_to_communication.cpp
@@ -35,7 +35,10 @@ void ConvertOpToCommunication::passImplementation(Fusion* fusion) {
       return new_top_level_exprs.push_back(top_level_expr);
     }
     for (auto* expr : nvfuser::convertSingleOpToCommunication(
-             top_level_expr, my_device_index, params_.communicator_backend)) {
+             top_level_expr,
+             my_device_index,
+             /*root=*/nullptr,
+             params_.communicator_backend)) {
       // Allocate the recv buffers of communications
       if (expr->isA<Communication>()) {
         auto* communication = expr->as<Communication>();
diff --git a/csrc/multidevice/propagation.cpp b/csrc/multidevice/propagation.cpp
@@ -350,26 +350,28 @@ void canonicalizeLoopDomain(TensorView* tv) {
            {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()},
            {tv->getLoopDomain().begin(), tv->getLoopDomain().end()}) |
            std::views::reverse) {
-    auto* split = dynamic_cast<Split*>(transform);
-    NVF_ERROR(
-        split != nullptr,
-        "Only splits are expected so far, but found: ",
-        transform);
-
-    if (split->outer()->isParallelized() || split->inner()->isParallelized()) {
+    if (std::ranges::any_of(
+            ir_utils::filterByType<IterDomain>(transform->outputs()),
+            [&loop](IterDomain* id) {
+              return id->isParallelized() || !loop.contains(id);
+            })) {
       continue;
     }
-
-    if (!loop.contains(split->outer()) || !loop.contains(split->inner())) {
+    if (auto* swizzle1d = dynamic_cast<Swizzle1D*>(transform)) {
+      auto it = loop.erase(swizzle1d->out()).second;
+      loop.insert(it, swizzle1d->in(), std::monostate());
       continue;
     }
-
-    loop.erase(split->outer());
-    const auto inner_i = loop.erase(split->inner()).second;
-    // `inner_i` is picked arbitrarily as the insertion point. Given `in`,
-    // `outer` and `inner` are all serial, `in`'s position in the loop domain
-    // doesn't matter.
-    loop.insert(inner_i, split->in(), std::monostate());
+    if (auto* split = dynamic_cast<Split*>(transform)) {
+      loop.erase(split->outer());
+      const auto inner_i = loop.erase(split->inner()).second;
+      // `inner_i` is picked arbitrarily as the insertion point. Given `in`,
+      // `outer` and `inner` are all serial, `in`'s position in the loop domain
+      // doesn't matter.
+      loop.insert(inner_i, split->in(), std::monostate());
+      continue;
+    }
+    NVF_THROW("Expected a swizzle1d or split transform. Got: ", transform);
   }
 
   auto new_loop = std::views::keys(loop);
diff --git a/tests/python/multidevice/test_overlap.py b/tests/python/multidevice/test_overlap.py
diff --git a/tools/linter/adapters/clangtidy_linter.py b/tools/linter/adapters/clangtidy_linter.py