MNNVL fix (#604)

chhwang · web-flow · commit 1cc1b827f4c9 · 2025-08-08T19:23:55.000Z
diff --git a/src/registered_memory.cc b/src/registered_memory.cc
@@ -204,7 +204,7 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
   if (getHostHash() == this->hostHash && getPidHash() == this->pidHash) {
     // The memory is local to the process, so originalDataPtr is valid as is
     this->data = this->originalDataPtr;
-  } else if (transports.has(Transport::CudaIpc) && getHostHash() == this->hostHash) {
+  } else if (transports.has(Transport::CudaIpc)) {
     // The memory is local to the machine but not to the process, so we need to open the CUDA IPC handle
     auto entry = getTransportInfo(Transport::CudaIpc);
     void* base;
diff --git a/src/semaphore.cc b/src/semaphore.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT license.
 
 #include <mscclpp/semaphore.hpp>
+#include <mscclpp/gpu_utils.hpp>
 
 #include "api.h"
 #include "atomic.hpp"
@@ -26,6 +27,11 @@ struct SemaphoreStub::Impl {
 };
 
 static std::shared_ptr<uint64_t> gpuCallocToken() {
+#if (CUDA_NVLS_API_AVAILABLE)
+  if (isNvlsSupported()) {
+    return detail::gpuCallocPhysicalShared<uint64_t>(1, 0);
+  }
+#endif  // CUDA_NVLS_API_AVAILABLE
 #if defined(MSCCLPP_DEVICE_HIP)
   return detail::gpuCallocUncachedShared<uint64_t>();
 #else   // !defined(MSCCLPP_DEVICE_HIP)