diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 4fdc8dd83..0bdc74f93 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -533,6 +533,7 @@ class Context : public std::enable_shared_from_this<Context> {
   friend class Endpoint;
   friend class Connection;
   friend class RegisteredMemory;
+  friend class SemaphoreStub;
 };
 
 /// Block of memory that has been registered to a Context.
diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp
index 1741a26fe..97b278e38 100644
--- a/include/mscclpp/gpu.hpp
+++ b/include/mscclpp/gpu.hpp
@@ -24,6 +24,7 @@ using CUmemGenericAllocationHandle = hipMemGenericAllocationHandle_t;
 using CUmemAllocationProp = hipMemAllocationProp;
 using CUmemAccessDesc = hipMemAccessDesc;
 using CUmemAllocationHandleType = hipMemAllocationHandleType;
+using CUmemAllocationGranularity_flags = hipMemAllocationGranularity_flags;
 
 constexpr auto cudaErrorPeerAccessAlreadyEnabled = hipErrorPeerAccessAlreadyEnabled;
 constexpr auto cudaErrorContextIsDestroyed = hipErrorContextIsDestroyed;
@@ -44,6 +45,7 @@ constexpr auto CU_MEM_ALLOCATION_TYPE_PINNED = hipMemAllocationTypePinned;
 constexpr auto CU_MEM_LOCATION_TYPE_DEVICE = hipMemLocationTypeDevice;
 constexpr auto CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = hipMemHandleTypePosixFileDescriptor;
 constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWrite;
+constexpr auto CU_MEM_ALLOC_GRANULARITY_MINIMUM = hipMemAllocationGranularityMinimum;
 
 #ifndef CUDA_SUCCESS
 #define CUDA_SUCCESS hipSuccess
@@ -106,6 +108,7 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRI
 #define cuMemRetainAllocationHandle(...) hipMemRetainAllocationHandle(__VA_ARGS__)
 #define cuMemExportToShareableHandle(...) hipMemExportToShareableHandle(__VA_ARGS__)
 #define cuMemImportFromShareableHandle(...) hipMemImportFromShareableHandle(__VA_ARGS__)
+#define cuMemGetAllocationGranularity(...) hipMemGetAllocationGranularity(__VA_ARGS__)
 
 #else
 
diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp
index 7eb67ea1f..0ab616e04 100644
--- a/include/mscclpp/gpu_utils.hpp
+++ b/include/mscclpp/gpu_utils.hpp
@@ -254,6 +254,7 @@ auto gpuCallocPhysicalUnique(size_t nelems = 1, size_t gran = 0, size_t align =
 }
 
 size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag);
+size_t getCuAllocationGranularity(CUmemAllocationGranularity_flags granFlag);
 
 #endif  // CUDA_NVLS_API_AVAILABLE
 
diff --git a/src/context.cc b/src/context.cc
index 610ad27e1..0f5045472 100644
--- a/src/context.cc
+++ b/src/context.cc
@@ -57,6 +57,13 @@ IbCtx *Context::Impl::getIbContext(Transport ibTransport) {
   return it->second.get();
 }
 
+std::shared_ptr<uint64_t> Context::Impl::getToken() {
+  if (!tokenPool_) {
+    tokenPool_ = std::make_shared<TokenPool>(maxNumTokens_);
+  }
+  return tokenPool_->getToken();
+}
+
 MSCCLPP_API_CPP Context::Context() : pimpl_(std::make_unique<Impl>()) {}
 
 MSCCLPP_API_CPP Context::~Context() = default;
diff --git a/src/gpu_utils.cc b/src/gpu_utils.cc
index f844cb40c..6687533d5 100644
--- a/src/gpu_utils.cc
+++ b/src/gpu_utils.cc
@@ -167,6 +167,21 @@ void* gpuCallocUncached(size_t bytes) {
 #endif  // defined(__HIP_PLATFORM_AMD__)
 
 #if (CUDA_NVLS_API_AVAILABLE)
+size_t getCuAllocationGranularity(CUmemAllocationGranularity_flags granFlag) {
+  size_t gran = 0;
+  int deviceId = -1;
+  MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
+
+  CUmemAllocationProp prop = {};
+  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  prop.location.id = deviceId;
+  prop.requestedHandleTypes =
+      (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
+  cuMemGetAllocationGranularity(&gran, &prop, granFlag);
+  return gran;
+}
+
 size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) {
   size_t gran = 0;
   int numDevices = 0;
diff --git a/src/include/context.hpp b/src/include/context.hpp
index 51cf3c375..b53a2662a 100644
--- a/src/include/context.hpp
+++ b/src/include/context.hpp
@@ -35,13 +35,17 @@ class CudaIpcStream {
   int deviceId() const { return deviceId_; }
 };
 
+class TokenPool;
 struct Context::Impl {
   std::unordered_map<Transport, std::unique_ptr<IbCtx>> ibContexts_;
   std::vector<std::shared_ptr<CudaIpcStream>> ipcStreams_;
+  std::shared_ptr<TokenPool> tokenPool_;
+  const size_t maxNumTokens_ = 1 << 15;  // 32K tokens
 
   Impl();
 
   IbCtx *getIbContext(Transport ibTransport);
+  std::shared_ptr<uint64_t> getToken();
 };
 
 }  // namespace mscclpp
diff --git a/src/include/utils_internal.hpp b/src/include/utils_internal.hpp
index d413b3f8f..c5c67e26c 100644
--- a/src/include/utils_internal.hpp
+++ b/src/include/utils_internal.hpp
@@ -64,6 +64,19 @@ struct PairHash {
   }
 };
 
+class TokenPool : public std::enable_shared_from_this<TokenPool> {
+ public:
+  TokenPool(size_t nTokens);
+  std::shared_ptr<uint64_t> getToken();
+
+ private:
+  size_t nToken_;
+  uint64_t* baseAddr_;
+  uint64_t tailMask_;
+  std::shared_ptr<uint64_t> tokens_;
+  std::vector<std::bitset<UINT64_WIDTH>> allocationMap_;
+};
+
 }  // namespace mscclpp
 
 #endif
diff --git a/src/semaphore.cc b/src/semaphore.cc
index ce9842418..a514542f1 100644
--- a/src/semaphore.cc
+++ b/src/semaphore.cc
@@ -20,18 +20,20 @@ struct SemaphoreStub::Impl {
 
   Impl(const std::vector<char>& data);
 
+  std::shared_ptr<uint64_t> gpuCallocToken(std::shared_ptr<Context> context);
+
   std::shared_ptr<Connection> connection_;
   std::shared_ptr<uint64_t> token_;
   RegisteredMemory idMemory_;
   Device device_;
 };
 
-static std::shared_ptr<uint64_t> gpuCallocToken() {
-// #if (CUDA_NVLS_API_AVAILABLE)
-//   if (isNvlsSupported()) {
-//     return detail::gpuCallocPhysicalShared<uint64_t>(1, 0);
-//   }
-// #endif  // CUDA_NVLS_API_AVAILABLE
+std::shared_ptr<uint64_t> SemaphoreStub::Impl::gpuCallocToken(std::shared_ptr<Context> context) {
+#if (CUDA_NVLS_API_AVAILABLE)
+  if (isNvlsSupported()) {
+    return context->pimpl_->getToken();
+  }
+#endif  // CUDA_NVLS_API_AVAILABLE
 #if defined(MSCCLPP_DEVICE_HIP)
   return detail::gpuCallocUncachedShared<uint64_t>();
 #else  // !defined(MSCCLPP_DEVICE_HIP)
@@ -49,7 +51,7 @@ SemaphoreStub::Impl::Impl(std::shared_ptr<Connection> connection) : connection_(
       throw Error("Local GPU ID is not provided", ErrorCode::InvalidUsage);
     }
     MSCCLPP_CUDATHROW(cudaSetDevice(localDevice.id));
-    token_ = gpuCallocToken();
+    token_ = gpuCallocToken(connection_->context());
   } else {
     throw Error("Unsupported local device type", ErrorCode::InvalidUsage);
   }
diff --git a/src/utils_internal.cc b/src/utils_internal.cc
index e5f16370c..9504a52cf 100644
--- a/src/utils_internal.cc
+++ b/src/utils_internal.cc
@@ -12,6 +12,7 @@
 #include <mscclpp/env.hpp>
 #include <mscclpp/errors.hpp>
 #include <mscclpp/gpu.hpp>
+#include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/utils.hpp>
 
 #include <string>
@@ -232,4 +233,42 @@ void getRandomData(void* buffer, size_t bytes) {
   }
 }
 
+TokenPool::TokenPool(size_t nToken) : nToken_(nToken) {
+#if (CUDA_NVLS_API_AVAILABLE)
+  tokens_ = detail::gpuCallocPhysicalShared<uint64_t>(
+      nToken, detail::getCuAllocationGranularity(CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+  MSCCLPP_CUTHROW(cuMemGetAddressRange((CUdeviceptr*)(&baseAddr_), NULL, (CUdeviceptr)tokens_.get()));
+  size_t nElems = (nToken + (UINT64_WIDTH - 1)) / UINT64_WIDTH;
+  allocationMap_.resize(nElems, 0);
+  tailMask_ = (nToken % UINT64_WIDTH) ? ((1UL << (nToken % UINT64_WIDTH)) - 1) : ~0UL;
+#else
+  throw Error("TokenPool only available on GPUs with NVLS support", ErrorCode::InvalidUsage);
+#endif
+}
+
+std::shared_ptr<uint64_t> TokenPool::getToken() {
+  auto deleter = [self = shared_from_this()](uint64_t* token) {
+    size_t index = (token - self->baseAddr_) / UINT64_WIDTH;
+    size_t bit = (token - self->baseAddr_) % UINT64_WIDTH;
+    uint64_t mask = 1UL << bit;
+    self->allocationMap_[index] &= ~mask;
+  };
+
+  size_t size = allocationMap_.size();
+  for (size_t i = 0; i < size; i++) {
+    uint64_t ullong = allocationMap_[i].to_ullong();
+    uint64_t mask = (i + 1 == size) ? tailMask_ : ~0ULL;
+    uint64_t holes = (~ullong) & mask;
+    if (!holes) continue;
+    for (int bit = 0; bit < UINT64_WIDTH; bit++) {
+      if (holes & (1UL << bit)) {
+        allocationMap_[i].set(bit);
+        INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", baseAddr_ + i * UINT64_WIDTH + bit);
+        return std::shared_ptr<uint64_t>(baseAddr_ + i * UINT64_WIDTH + bit, deleter);
+      }
+    }
+  }
+  throw Error("TokenPool is exhausted", ErrorCode::InternalError);
+}
+
 }  // namespace mscclpp