Skip to content

Commit 7b4ff01

Browse files
authored
refine barrier stream and add async_op to log (#1824)
1 parent 798a079 commit 7b4ff01

File tree

2 files changed

+108
-66
lines changed

2 files changed

+108
-66
lines changed

src/xccl/ProcessGroupXCCL.cpp

Lines changed: 70 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -755,7 +755,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::send(
755755
std::vector<int64_t>(), // outSplitSizes
756756
-1, // globalRankStart
757757
-1, // globalRankStride
758-
this->getSize()); // worldSize
758+
this->getSize(), // worldSize
759+
"N/A"); // async_op
759760

760761
auto ret = pointToPoint(
761762
tensor,
@@ -804,7 +805,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::recv(
804805
std::vector<int64_t>(), // outSplitSizes
805806
-1, // globalRankStart
806807
-1, // globalRankStride
807-
this->getSize()); // worldSize
808+
this->getSize(), // worldSize
809+
"N/A"); // async_op
808810

809811
auto ret = pointToPoint(
810812
tensor,
@@ -889,7 +891,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::gather(
889891
std::vector<int64_t>(), // outSplitSize
890892
-1, // globalRankStart
891893
-1, // globalRankStride
892-
this->getSize()); // worldSize
894+
this->getSize(), // worldSize
895+
opts.asyncOp); // async_op
893896

894897
auto inputs = std::vector<at::Tensor>{inputTensor};
895898
return collective(
@@ -1003,7 +1006,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::scatter(
10031006
std::vector<int64_t>(), // outSplitSize
10041007
-1, // globalRankStart
10051008
-1, // globalRankStride
1006-
this->getSize()); // worldSize
1009+
this->getSize(), // worldSize
1010+
opts.asyncOp); // async_op
10071011

10081012
const auto root = opts.rootRank;
10091013

@@ -1131,7 +1135,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::allreduce(
11311135
std::vector<int64_t>(), // outSplitSizes
11321136
-1, // globalRankStart
11331137
-1, // globalRankStride
1134-
size_); // worldSize
1138+
size_, // worldSize
1139+
opts.asyncOp); // async_op
11351140

11361141
return allreduce_impl(tensor, "xccl:all_reduce", opts);
11371142
}
@@ -1157,7 +1162,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::allreduce_coalesced(
11571162
std::vector<int64_t>(), // outSplitSizes
11581163
-1, // globalRankStart
11591164
-1, // globalRankStride
1160-
this->getSize()); // worldSize
1165+
this->getSize(), // worldSize
1166+
opts.asyncOp); // async_op
11611167

11621168
return collectiveCoalesced(
11631169
tensors,
@@ -1219,7 +1225,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::broadcast(
12191225
std::vector<int64_t>(), // outSplitSizes
12201226
-1, // globalRankStart
12211227
-1, // globalRankStride
1222-
this->getSize()); // worldSize
1228+
this->getSize(), // worldSize
1229+
opts.asyncOp); // async_op
12231230

12241231
const auto root = opts.rootRank + opts.rootTensor;
12251232

@@ -1310,7 +1317,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::reduce(
13101317
std::vector<int64_t>(), // outSplitSizes
13111318
-1, // globalRankStart
13121319
-1, // globalRankStride
1313-
this->getSize()); // worldSize
1320+
this->getSize(), // worldSize
1321+
opts.asyncOp); // async_op
13141322

13151323
return collective(
13161324
tensor,
@@ -1419,7 +1427,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::allgather(
14191427
std::vector<int64_t>(), // outSplitSize
14201428
-1, // globalRankStart
14211429
-1, // globalRankStride
1422-
this->getSize()); // worldSize
1430+
this->getSize(), // worldSize
1431+
opts.asyncOp); // async_op
14231432

14241433
bool same_size = checkSameSize(outputTensors_);
14251434
if (same_size) {
@@ -1506,7 +1515,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::_allgather_base(
15061515
std::vector<int64_t>(), // outSplitSize
15071516
-1, // globalRankStart
15081517
-1, // globalRankStride
1509-
this->getSize()); // worldSize
1518+
this->getSize(), // worldSize
1519+
opts.asyncOp); // async_op
15101520

15111521
return collective(
15121522
input_tensor,
@@ -1552,7 +1562,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::allgather_into_tensor_coalesced(
15521562
std::vector<int64_t>(), // outSplitSizes
15531563
-1, // globalRankStart
15541564
-1, // globalRankStride
1555-
this->getSize()); // worldSize
1565+
this->getSize(), // worldSize
1566+
opts.asyncOp); // async_op
15561567

15571568
return collectiveCoalesced(
15581569
inputs,
@@ -1603,7 +1614,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::reduce_scatter(
16031614
std::vector<int64_t>(), // outSplitSizes
16041615
-1, // globalRankStart
16051616
-1, // globalRankStride
1606-
this->getSize()); // worldSize
1617+
this->getSize(), // worldSize
1618+
opts.asyncOp); // async_op
16071619

16081620
bool same_size = checkSameSize(inputTensors_);
16091621
if (same_size) {
@@ -1700,7 +1712,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::_reduce_scatter_base(
17001712
std::vector<int64_t>(), // outSplitSizes
17011713
-1, // globalRankStart
17021714
-1, // globalRankStride
1703-
this->getSize()); // worldSize
1715+
this->getSize(), // worldSize
1716+
opts.asyncOp); // async_op
17041717

17051718
return collective(
17061719
inputTensor,
@@ -1740,7 +1753,6 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::reduce_scatter_tensor_coalesced(
17401753
std::vector<at::Tensor>& outputs,
17411754
std::vector<at::Tensor>& inputs,
17421755
const ReduceScatterOptions& opts) {
1743-
17441756
RECORD_PARAM_COMMS_DATA_WITH_LOG(
17451757
std::make_tuple(
17461758
static_cast<int64_t>(seqCollective_) + 1,
@@ -1758,7 +1770,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::reduce_scatter_tensor_coalesced(
17581770
std::vector<int64_t>(), // outSplitSizes
17591771
-1, // globalRankStart
17601772
-1, // globalRankStride
1761-
this->getSize()); // worldSize
1773+
this->getSize(), // worldSize
1774+
opts.asyncOp); // async_op
17621775

17631776
return collectiveCoalesced(
17641777
inputs,
@@ -1794,6 +1807,25 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::reduce_scatter_tensor_coalesced(
17941807
"xccl:reduce_scatter_tensor_coalesced");
17951808
}
17961809

1810+
c10::DeviceIndex ProcessGroupXCCL::guessDeviceId() const {
1811+
if (getBoundDeviceId().has_value()) {
1812+
return getBoundDeviceId().value().index();
1813+
} else if (!usedDeviceIdxs_.empty()) {
1814+
return *usedDeviceIdxs_.begin();
1815+
}
1816+
int devIdx =
1817+
static_cast<int16_t>(rank_ % at::detail::getXPUHooks().getNumGPUs());
1818+
LOG(WARNING)
1819+
<< logPrefix()
1820+
<< c10::str(
1821+
" using GPU ",
1822+
devIdx,
1823+
" as device used by this process is currently unknown. ",
1824+
"This can potentially cause a hang if this rank to GPU mapping is incorrect. ",
1825+
"You can specify device_id in init_process_group() to force use of a particular device.");
1826+
return static_cast<c10::DeviceIndex>(devIdx);
1827+
}
1828+
17971829
c10::intrusive_ptr<Work> ProcessGroupXCCL::barrier(const BarrierOptions& opts) {
17981830
RECORD_PARAM_COMMS(
17991831
static_cast<int>(
@@ -1810,18 +1842,13 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::barrier(const BarrierOptions& opts) {
18101842
-1, // globalRankStride
18111843
this->getSize()); // worldSize
18121844
// Device to use for barrier
1813-
int barDevIdx = -1;
1845+
c10::DeviceIndex barDevIdx = -1;
18141846

18151847
// See nccl barrier comments
18161848
if (!opts.device_ids.empty()) {
1817-
barDevIdx = opts.device_ids[0];
1818-
} else if (getBoundDeviceId()) {
1819-
barDevIdx = (*getBoundDeviceId()).index();
1820-
} else if (!usedDeviceIdxs_.empty()) {
1821-
barDevIdx = *usedDeviceIdxs_.begin();
1849+
barDevIdx = static_cast<c10::DeviceIndex>(opts.device_ids[0]);
18221850
} else {
1823-
barDevIdx =
1824-
static_cast<int16_t>(rank_ % at::detail::getXPUHooks().getNumGPUs());
1851+
barDevIdx = guessDeviceId();
18251852
}
18261853

18271854
TORCH_CHECK_WITH(
@@ -1833,12 +1860,20 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::barrier(const BarrierOptions& opts) {
18331860
at::Tensor barrierTensor =
18341861
at::zeros({1}, at::TensorOptions().device(barDevice).dtype(at::kFloat));
18351862

1836-
auto work = allreduce_impl(barrierTensor, "xccl:all_reduce_barrier");
1863+
AllreduceOptions arOpts = AllreduceOptions();
1864+
arOpts.asyncOp = opts.asyncOp;
1865+
auto work = allreduce_impl(barrierTensor, "xccl:all_reduce_barrier", arOpts);
1866+
1867+
if (opts.asyncOp) {
1868+
auto xcclWork = dynamic_cast<ProcessGroupXCCL::WorkXCCL*>(work.get());
1869+
TORCH_CHECK(xcclWork);
1870+
xcclWork->isBarrierOp_ = true;
1871+
return work;
1872+
}
18371873

1838-
auto xcclWork = dynamic_cast<ProcessGroupXCCL::WorkXCCL*>(work.get());
1839-
TORCH_CHECK(xcclWork);
1840-
xcclWork->isBarrierOp_ = true;
1841-
return work;
1874+
auto currentStream = at::xpu::getCurrentXPUStream(barDevIdx);
1875+
currentStream.synchronize();
1876+
return nullptr;
18421877
}
18431878

18441879
c10::intrusive_ptr<Work> ProcessGroupXCCL::alltoall_base(
@@ -1866,7 +1901,9 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::alltoall_base(
18661901
std::vector<int64_t>(), // outSplitSizes
18671902
-1, // globalRankStart
18681903
-1, // globalRankStride
1869-
this->getSize()); // worldSize
1904+
this->getSize(), // worldSize
1905+
opts.asyncOp); // async_op
1906+
18701907
TORCH_CHECK(
18711908
outputTensor.numel() == inputTensor.numel() &&
18721909
outputTensor.scalar_type() == inputTensor.scalar_type(),
@@ -1915,7 +1952,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::alltoall_base(
19151952
outputSplitSizes, // outSplitSizes
19161953
-1, // globalRankStart
19171954
-1, // globalRankStride
1918-
this->getSize()); // worldSize
1955+
this->getSize(), // worldSize
1956+
opts.asyncOp); // async_op
19191957

19201958
return collective(
19211959
inputTensor,
@@ -1991,7 +2029,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::alltoall(
19912029
std::vector<int64_t>(), // outSplitSizes
19922030
-1, // globalRankStart
19932031
-1, // globalRankStride
1994-
this->getSize()); // worldSize
2032+
this->getSize(), // worldSize
2033+
opts.asyncOp); // async_op
19952034

19962035
return collective(
19972036
inputTensors,

src/xccl/ProcessGroupXCCL.hpp

Lines changed: 38 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,8 @@ class TORCH_API ProcessGroupXCCL : public Backend {
367367

368368
const std::string& logPrefix() const;
369369

370+
c10::DeviceIndex guessDeviceId() const;
371+
370372
protected:
371373
std::unordered_map<std::string, std::pair<at::xpu::XPUStream, ccl::stream>>
372374
xcclStreamsMap_;
@@ -465,41 +467,42 @@ inline std::string reduceOpToString(c10d::ReduceOp op) {
465467
// Since the current profiler trace support for XCCL is unclear, wrap
466468
// `RECORD_PARAM_COMMS_DATA` and output parameters as debug logs.
467469
// export TORCH_CPP_LOG_LEVEL=INFO
468-
#define RECORD_PARAM_COMMS_DATA_WITH_LOG( \
469-
seq, \
470-
pg_name_tuple, \
471-
inputTensors, \
472-
outputTensors, \
473-
rank, \
474-
collective_name, \
475-
inNelems, \
476-
outNelems, \
477-
dType, \
478-
inSplitSizes, \
479-
outSplitSizes, \
480-
globalRankStart, \
481-
globalRankStride, \
482-
worldSize) \
483-
do { \
484-
LOG(INFO) << "collective_name: " << collective_name \
485-
<< ", inNelems: " << inNelems << ", outNelems: " << outNelems \
486-
<< ", dType: " << dType << ", root/src rank: " << rank \
487-
<< ", worldSize: " << worldSize; \
488-
RECORD_PARAM_COMMS_DATA( \
489-
seq, \
490-
pg_name_tuple, \
491-
inputTensors, \
492-
outputTensors, \
493-
rank, \
494-
collective_name, \
495-
inNelems, \
496-
outNelems, \
497-
dType, \
498-
inSplitSizes, \
499-
outSplitSizes, \
500-
globalRankStart, \
501-
globalRankStride, \
502-
worldSize); \
470+
#define RECORD_PARAM_COMMS_DATA_WITH_LOG( \
471+
seq, \
472+
pg_name_tuple, \
473+
inputTensors, \
474+
outputTensors, \
475+
rank, \
476+
collective_name, \
477+
inNelems, \
478+
outNelems, \
479+
dType, \
480+
inSplitSizes, \
481+
outSplitSizes, \
482+
globalRankStart, \
483+
globalRankStride, \
484+
worldSize, \
485+
async_op) \
486+
do { \
487+
LOG(INFO) << std::boolalpha << "collective_name: " << collective_name \
488+
<< ", inNelems: " << inNelems << ", outNelems: " << outNelems \
489+
<< ", dType: " << dType << ", root/src rank: " << rank \
490+
<< ", worldSize: " << worldSize << ", async_op: " << async_op; \
491+
RECORD_PARAM_COMMS_DATA( \
492+
seq, \
493+
pg_name_tuple, \
494+
inputTensors, \
495+
outputTensors, \
496+
rank, \
497+
collective_name, \
498+
inNelems, \
499+
outNelems, \
500+
dType, \
501+
inSplitSizes, \
502+
outSplitSizes, \
503+
globalRankStart, \
504+
globalRankStride, \
505+
worldSize); \
503506
} while (0)
504507
} // namespace
505508
#endif // USE_C10D_XCCL

0 commit comments

Comments
 (0)