NVIDIA · saivishal1999 · Feb 27, 2026 · Feb 27, 2026 · Mar 2, 2026 · Mar 3, 2026
diff --git a/1 b/1
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Local build helper: exports toolchain and nvFuser build settings, then
+# installs ./python in editable mode with pip.
+
+export CC=clang-20
+export CXX=clang++-20
+export CUDAHOSTCXX=/usr/bin/clang++-20
+export LDFLAGS="-fuse-ld=mold"
+
+# A bare `export NAME` assigns no value, so the flag would only reach pip if
+# the caller had already set it. Default to 1 but honor a caller-supplied value.
+export NVFUSER_BUILD_ENABLE_PCH="${NVFUSER_BUILD_ENABLE_PCH:-1}"
+
+export UCC_HOME="/opt/hpcx/ucc"
+export UCC_DIR="/opt/hpcx/ucc/lib/cmake/ucc"
+export UCX_HOME="/opt/hpcx/ucx"
+export UCX_DIR="/opt/hpcx/ucx/lib/cmake/ucx"
+
+# export TORCH_CUDA_ARCH_LIST="9.0"
+
+export NVFUSER_BUILD_WITH_UCC=1
+# NOTE(review): BUILD_DIRECTORY is never set in this script; it is presumably
+# provided by the caller's environment -- confirm.
+export NVFUSER_BUILD_INSTALL_DIR="$BUILD_DIRECTORY/nvfuser"
+export NVFUSER_BUILD_DIR="$BUILD_DIRECTORY"
+
+# Enable debug mode, leave empty for non-debug compilation
+export NVFUSER_BUILD_BUILD_TYPE=Debug
+export RUN_CMAKE=""
+
+pip install -v -e ./python --no-build-isolation
diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
@@ -14,7 +14,13 @@
 #include <numeric>
 
 #ifdef NVFUSER_DISTRIBUTED
+#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
+#include <torch/csrc/distributed/c10d/GroupRegistry.hpp>
+#endif
 #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
+#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
+#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
+#endif
 #include <torch/csrc/distributed/c10d/exception.h>
 #ifdef USE_C10D_NCCL
 #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
@@ -362,6 +368,14 @@ void Communicator::cleanup() {
       pg_nccl->shutdown();
     }
   }
+#endif
+#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
+  // Remove every process group this communicator registered with c10d, then
+  // drop the references held here.
+  for (const auto& entry : process_groups_) {
+    c10d::unregister_process_group(entry.first);
+  }
+  process_groups_.clear();
 #endif
   backends_.clear();
 }
@@ -402,6 +414,34 @@ c10d::Backend* Communicator::getBackendForTeam(
     }();
 #else
     backends_[team_key] = nullptr;
+#endif
+#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
+    // Wrap the backend cached for this team in a c10d::ProcessGroup and
+    // register it under team_key. Only kNccl maps to a ProcessGroup backend
+    // type here, so no group is registered for any other backend.
+    std::optional<c10d::ProcessGroup::BackendType> pg_backend =
+        (b == CommunicatorBackend::kNccl)
+        ? std::optional<c10d::ProcessGroup::BackendType>(
+              c10d::ProcessGroup::BackendType::NCCL)
+        : std::nullopt;
+    if (backends_[team_key] != nullptr && pg_backend.has_value()) {
+      // NOTE(review): rank_it is not compared against team.end(); this assumes
+      // deviceId() is a member of team -- confirm an earlier check ensures it.
+      auto rank_it = std::find(team.begin(), team.end(), deviceId());
+      RankType team_rank = std::distance(team.begin(), rank_it);
+
+      auto pg = c10::make_intrusive<c10d::ProcessGroup>(
+          c10::make_intrusive<c10d::PrefixStore>(team_key, store_),
+          team_rank,
+          static_cast<int>(team.size()));
+      pg->setBackend(c10::DeviceType::CUDA, *pg_backend, backends_[team_key]);
+      pg->setDefaultBackend(*pg_backend);
+      pg->setGroupName(team_key);
+
+      c10d::register_process_group(team_key, pg);
+      // Keep a reference so cleanup() can unregister the group later.
+      process_groups_[team_key] = std::move(pg);
+    }
 #endif
   }
   return backends_.at(team_key).get();
@@ -424,4 +458,16 @@ void Communicator::barrier(std::optional<CommunicatorBackend> backend) {
   getWorld(backend)->barrier(options)->wait();
 }
 
-} // namespace nvfuser
+std::string Communicator::getSymmMemGroupKey(
+    std::optional<CommunicatorBackend> backend) {
+  // The world team is every rank in [0, size_).
+  std::vector<RankType> all_ranks(size_);
+  std::iota(all_ranks.begin(), all_ranks.end(), 0);
+  CommunicatorBackend b = backend.value_or(default_backend_);
+  // Called only for its side effect: makes sure the world backend has been
+  // created (and registered, where supported) before its key is handed out.
+  (void)getBackendForTeam(all_ranks, b);
+  return getTeamKey(all_ranks, b);
+}
+
+} // namespace nvfuser
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
@@ -11,8 +11,19 @@
 #include <ATen/core/ivalue.h>
 #include <c10/util/intrusive_ptr.h>
 
+#if defined(NVFUSER_DISTRIBUTED) && \
+    __has_include(<torch/csrc/distributed/c10d/GroupRegistry.hpp>) && \
+    __has_include(<torch/csrc/distributed/c10d/ProcessGroup.hpp>)
+#define NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP 1
+#else
+#define NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP 0
+#endif
+
 #ifdef NVFUSER_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/Backend.hpp>
+#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
+#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
+#endif
 #include <torch/csrc/distributed/c10d/TCPStore.hpp>
 #include <torch/csrc/distributed/c10d/Work.hpp>
 #else
@@ -110,6 +121,10 @@ class NVF_API Communicator {
   c10d::Backend* getWorld(
       std::optional<CommunicatorBackend> backend = std::nullopt);
 
+  // Returns the world process-group name; creates that backend if needed.
+  std::string getSymmMemGroupKey(
+      std::optional<CommunicatorBackend> backend = std::nullopt);
+
   // returns if a backend is available for creation
   bool isBackendAvailable(CommunicatorBackend backend) const {
     if (backend == CommunicatorBackend::kUcc) {
@@ -153,6 +168,11 @@ class NVF_API Communicator {
   c10::intrusive_ptr<c10d::TCPStore> store_;
   // cache for the created backends. The keys are strings generated from Teams
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_;
+#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
+  // c10d process-group wrappers registered for symmetric-memory rendezvous.
+  std::unordered_map<std::string, c10::intrusive_ptr<c10d::ProcessGroup>>
+      process_groups_;
+#endif
 };
 
 } // namespace nvfuser
diff --git a/csrc/multidevice/ipc_utils.cpp b/csrc/multidevice/ipc_utils.cpp
@@ -191,4 +191,30 @@ MulticastProtocol getMulticastProtocol() {
   return MulticastProtocol::BatchMemcpy;
 }
 
+// Picks the symmetric-memory backend from the SymmetricMemoryBackend enable
+// option. Arguments are checked in the order listed below and the first one
+// present wins; Native is returned when the option is off or none matches.
+SymmetricMemoryBackend getSymmetricMemoryBackend() {
+  constexpr EnableOption kOption = EnableOption::SymmetricMemoryBackend;
+  if (!isOptionEnabled(kOption)) {
+    return SymmetricMemoryBackend::Native;
+  }
+
+  struct Choice {
+    const char* argument;
+    SymmetricMemoryBackend backend;
+  };
+  static constexpr Choice kChoices[] = {
+      {"pytorch_nccl", SymmetricMemoryBackend::PyTorchNccl},
+      {"pytorch_nvshmem", SymmetricMemoryBackend::PyTorchNvshmem},
+      {"pytorch_cuda", SymmetricMemoryBackend::PyTorchCuda},
+  };
+  for (const Choice& choice : kChoices) {
+    if (hasEnableOptionArgument(kOption, choice.argument)) {
+      return choice.backend;
+    }
+  }
+  return SymmetricMemoryBackend::Native;
+}
+
 } // namespace nvfuser
diff --git a/csrc/multidevice/ipc_utils.h b/csrc/multidevice/ipc_utils.h
@@ -33,6 +33,19 @@ enum class MulticastProtocol { Memcpy, Multimem, BatchMemcpy };
 
 MulticastProtocol getMulticastProtocol();
 
+// Backend for symmetric memory allocation and rendezvous.
+// Native: Fuser's own CUDA VMM + IPC implementation (default, maintained).
+// PyTorch*: Use PyTorch's symmetric memory (torch.distributed._symmetric_memory)
+// with the given transport backend (Nccl, Nvshmem, or Cuda).
+enum class SymmetricMemoryBackend {
+  Native, // fallback: option disabled or no recognized argument
+  PyTorchNccl, // selected by option argument "pytorch_nccl"
+  PyTorchNvshmem, // selected by option argument "pytorch_nvshmem"
+  PyTorchCuda, // selected by option argument "pytorch_cuda"
+};
+
+SymmetricMemoryBackend getSymmetricMemoryBackend();
+
 // Creates a listening Unix domain socket bound to path.
 // If path starts with '@', it uses the abstract namespace (replaced with \0).
 // Returns the socket file descriptor.