Skip to content

Commit 5c17125

Browse files
nsarka, greptile-apps[bot], and wujingyue
authored
Reduce and Allreduce NVLS implementations for the cuda backend (#6038)
Built on top of #5620. Adds reduce and allreduce NVLS implementations. Both use the same ld_reduce kernel and synchronize using a symmetric integer tensor as a semaphore.

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
Co-authored-by: Jingyue Wu <wujingyue@gmail.com>
1 parent 617fa07 commit 5c17125

13 files changed

+1316
-93
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1243,6 +1243,7 @@ list(APPEND NVFUSER_RUNTIME_FILES
12431243
${NVFUSER_ROOT}/runtime/mbarrier.cu
12441244
${NVFUSER_ROOT}/runtime/memory.cu
12451245
${NVFUSER_ROOT}/runtime/multicast.cu
1246+
${NVFUSER_ROOT}/runtime/multicast_reduce.cu
12461247
${NVFUSER_ROOT}/runtime/alltoallv.cu
12471248
${NVFUSER_ROOT}/runtime/tma_copy.cu
12481249
${NVFUSER_ROOT}/runtime/random_numbers.cu

csrc/host_ir/evaluator.cpp

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ HostIrEvaluator::HostIrEvaluator(
4848
expr_evaluator_(),
4949
my_local_device_index_(
5050
communicator_ == nullptr ? 0 : communicator_->local_rank()),
51-
ipc_handle_cache_(expr_evaluator_),
51+
ipc_handle_cache_(&expr_evaluator_),
5252
multicast_handle_cache_() {
5353
const DeviceIdxType device_index =
5454
communicator_ == nullptr ? 0 : communicator_->deviceId();
@@ -364,16 +364,27 @@ void HostIrEvaluator::handle(Communication* communication) {
364364
.stream());
365365
NVF_ERROR(
366366
communication->type() == CommunicationType::Broadcast ||
367-
communication->type() == CommunicationType::Allgather,
368-
"Invalid communication type, expected Broadcast or Allgather, got: ",
367+
communication->type() == CommunicationType::Allgather ||
368+
communication->type() == CommunicationType::Allreduce ||
369+
communication->type() == CommunicationType::Reduce,
370+
"Invalid communication type for CUDA backend, got: ",
369371
communication->type());
370372
int64_t root_val =
371373
expr_evaluator_.evaluate(communication->root()).as<int64_t>();
374+
int64_t cache_root =
375+
(communication->type() == CommunicationType::Broadcast ||
376+
communication->type() == CommunicationType::Reduce)
377+
? root_val
378+
: -1;
379+
// For Reduce, non-roots may have no output; use input for cache key
380+
at::Tensor cache_buffer =
381+
output_tensor.defined() ? output_tensor : input_tensor;
372382
SymmetricMemoryHandle* multicast_handle = multicast_handle_cache_.get(
373-
{.buffer = output_tensor, .expr = communication, .root = root_val});
383+
{.buffer = cache_buffer, .expr = communication, .root = cache_root});
374384
postWithCudaBackend(
375385
communication,
376386
input_tensor,
387+
output_tensor,
377388
multicast_handle,
378389
current_stream,
379390
root_val);
@@ -492,15 +503,25 @@ void HostIrEvaluator::handle(Wait* wait) {
492503
communication && communication->backend() == CommunicatorBackend::kCuda) {
493504
NVF_ERROR(
494505
communication->type() == CommunicationType::Broadcast ||
495-
communication->type() == CommunicationType::Allgather,
496-
"Invalid communication type, only Broadcast and Allgather are "
497-
"supported with cuda backend, got: ",
506+
communication->type() == CommunicationType::Allgather ||
507+
communication->type() == CommunicationType::Allreduce ||
508+
communication->type() == CommunicationType::Reduce,
509+
"Invalid communication type for CUDA backend, got: ",
498510
communication->type());
499511
at::Tensor output_tensor = getKnownTensorOrUndefined(communication->out());
512+
at::Tensor input_tensor =
513+
getKnownTensorOrUndefined(communication->input(0));
500514
int64_t root_val =
501515
expr_evaluator_.evaluate(communication->root()).as<int64_t>();
516+
int64_t cache_root =
517+
(communication->type() == CommunicationType::Broadcast ||
518+
communication->type() == CommunicationType::Reduce)
519+
? root_val
520+
: -1;
521+
at::Tensor cache_buffer =
522+
output_tensor.defined() ? output_tensor : input_tensor;
502523
SymmetricMemoryHandle* multicast_handle = multicast_handle_cache_.get(
503-
{.buffer = output_tensor, .expr = communication, .root = root_val});
524+
{.buffer = cache_buffer, .expr = communication, .root = cache_root});
504525
waitWithCudaBackend(
505526
communication, multicast_handle, current_stream, root_val);
506527
} else {

0 commit comments

Comments (0)