Skip to content

Commit c580e4c

Browse files
chhwangBinyang2014
andauthored
Support CudaIpc connection within a single process (#593)
* Allow CudaIpc connection between GPUs in a single process * Added an example of connection in a single process * Minor interface updates --------- Co-authored-by: Binyang Li <[email protected]>
1 parent c3b47c5 commit c580e4c

File tree

13 files changed

+262
-49
lines changed

13 files changed

+262
-49
lines changed

apps/nccl/src/nccl.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -660,7 +660,7 @@ NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) {
660660
WARN("uniqueId is nullptr");
661661
return ncclInvalidArgument;
662662
}
663-
if (MSCCLPP_UNIQUE_ID_BYTES != NCCL_UNIQUE_ID_BYTES) return ncclInternalError;
663+
if (mscclpp::UniqueIdBytes != NCCL_UNIQUE_ID_BYTES) return ncclInternalError;
664664
mscclpp::UniqueId id = mscclpp::TcpBootstrap::createUniqueId();
665665
memcpy(uniqueId, &id, sizeof(ncclUniqueId));
666666
return ncclSuccess;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
gpu_ping_pong
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
CUDA_HOME ?= /usr/local/cuda
2+
ROCM_HOME ?= /opt/rocm
3+
4+
# Check if nvcc exists, otherwise use hipcc
5+
ifeq ($(shell which $(CUDA_HOME)/bin/nvcc 2>/dev/null),)
6+
COMPILER := $(ROCM_HOME)/bin/hipcc
7+
ARCH_FLAG := -D__HIP_PLATFORM_AMD__=1
8+
else
9+
COMPILER := $(CUDA_HOME)/bin/nvcc
10+
ARCH_FLAG := -arch=native
11+
endif
12+
13+
TARGET = gpu_ping_pong
14+
SRC = gpu_ping_pong.cu
15+
16+
all: $(TARGET)
17+
18+
$(TARGET): $(SRC)
19+
$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp
20+
21+
clean:
22+
rm -f $(TARGET)
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT license.
3+
4+
#include <iostream>
5+
#include <mscclpp/core.hpp>
6+
#include <mscclpp/gpu_utils.hpp>
7+
#include <mscclpp/memory_channel.hpp>
8+
#include <mscclpp/memory_channel_device.hpp>
9+
#include <sstream>
10+
11+
template <typename... Args>
12+
void log(Args &&...args) {
13+
std::stringstream ss;
14+
(ss << ... << args);
15+
ss << std::endl;
16+
std::cout << ss.str();
17+
}
18+
19+
__device__ void spin_cycles(unsigned long long cycles) {
20+
unsigned long long start = clock64();
21+
while (clock64() - start < cycles) {
22+
// spin
23+
}
24+
}
25+
26+
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
27+
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
28+
for (int i = 0; i < iter; ++i) {
29+
devHandle->relaxedWait();
30+
// spin for a few ms
31+
spin_cycles(1e7);
32+
devHandle->relaxedSignal();
33+
}
34+
}
35+
}
36+
37+
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
38+
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
39+
for (int i = 0; i < iter; ++i) {
40+
devHandle->relaxedSignal();
41+
devHandle->relaxedWait();
42+
}
43+
}
44+
}
45+
46+
int main() {
47+
// Optional: check if we have at least two GPUs
48+
int deviceCount;
49+
MSCCLPP_CUDATHROW(cudaGetDeviceCount(&deviceCount));
50+
if (deviceCount < 2) {
51+
log("Error: At least two GPUs are required.");
52+
return 1;
53+
}
54+
55+
// Optional: check if the two GPUs can peer-to-peer access each other
56+
int canAccessPeer;
57+
MSCCLPP_CUDATHROW(cudaDeviceCanAccessPeer(&canAccessPeer, 0, 1));
58+
if (!canAccessPeer) {
59+
log("Error: GPU 0 cannot access GPU 1. Make sure that the GPUs are connected peer-to-peer. You can check this "
60+
"by running `nvidia-smi topo -m` (the connection between GPU 0 and 1 should be either NV# or PIX).");
61+
return 1;
62+
}
63+
64+
const int iter = 100;
65+
const mscclpp::Transport transport = mscclpp::Transport::CudaIpc;
66+
67+
log("Creating endpoints ...");
68+
69+
auto ctx = mscclpp::Context::create();
70+
mscclpp::Endpoint ep0 = ctx->createEndpoint({transport, {mscclpp::DeviceType::GPU, 0}});
71+
mscclpp::Endpoint ep1 = ctx->createEndpoint({transport, {mscclpp::DeviceType::GPU, 1}});
72+
73+
log("GPU 0: Creating a connection and a semaphore stub ...");
74+
75+
MSCCLPP_CUDATHROW(cudaSetDevice(0));
76+
std::shared_ptr<mscclpp::Connection> conn0 = ctx->connect(/*localEndpoint*/ ep0, /*remoteEndpoint*/ ep1);
77+
mscclpp::SemaphoreStub semaStub0(conn0);
78+
79+
log("GPU 1: Creating a connection and a semaphore stub ...");
80+
81+
MSCCLPP_CUDATHROW(cudaSetDevice(1));
82+
std::shared_ptr<mscclpp::Connection> conn1 = ctx->connect(/*localEndpoint*/ ep1, /*remoteEndpoint*/ ep0);
83+
mscclpp::SemaphoreStub semaStub1(conn1);
84+
85+
log("GPU 0: Creating a semaphore and a memory channel ...");
86+
87+
MSCCLPP_CUDATHROW(cudaSetDevice(0));
88+
mscclpp::Semaphore sema0(/*localSemaphoreStub*/ semaStub0, /*remoteSemaphoreStub*/ semaStub1);
89+
mscclpp::BaseMemoryChannel memChan0(sema0);
90+
mscclpp::BaseMemoryChannelDeviceHandle memChanHandle0 = memChan0.deviceHandle();
91+
void *devHandle0;
92+
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle0, sizeof(mscclpp::BaseMemoryChannelDeviceHandle)));
93+
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle0, &memChanHandle0, sizeof(memChanHandle0), cudaMemcpyHostToDevice));
94+
95+
log("GPU 1: Creating a semaphore and a memory channel ...");
96+
97+
MSCCLPP_CUDATHROW(cudaSetDevice(1));
98+
mscclpp::Semaphore sema1(/*localSemaphoreStub*/ semaStub1, /*remoteSemaphoreStub*/ semaStub0);
99+
mscclpp::BaseMemoryChannel memChan1(sema1);
100+
mscclpp::BaseMemoryChannelDeviceHandle memChanHandle1 = memChan1.deviceHandle();
101+
void *devHandle1;
102+
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle1, sizeof(mscclpp::BaseMemoryChannelDeviceHandle)));
103+
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle1, &memChanHandle1, sizeof(memChanHandle1), cudaMemcpyHostToDevice));
104+
105+
log("GPU 0: Launching gpuKernel0 ...");
106+
107+
MSCCLPP_CUDATHROW(cudaSetDevice(0));
108+
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle0), iter);
109+
MSCCLPP_CUDATHROW(cudaGetLastError());
110+
111+
log("GPU 1: Launching gpuKernel1 ...");
112+
113+
MSCCLPP_CUDATHROW(cudaSetDevice(1));
114+
cudaEvent_t start, end;
115+
MSCCLPP_CUDATHROW(cudaEventCreate(&start));
116+
MSCCLPP_CUDATHROW(cudaEventCreate(&end));
117+
MSCCLPP_CUDATHROW(cudaEventRecord(start));
118+
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle1), iter);
119+
MSCCLPP_CUDATHROW(cudaGetLastError());
120+
MSCCLPP_CUDATHROW(cudaEventRecord(end));
121+
MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
122+
123+
float elapsedMs;
124+
MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedMs, start, end));
125+
126+
MSCCLPP_CUDATHROW(cudaSetDevice(0));
127+
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
128+
129+
float msPerIter = elapsedMs / iter;
130+
log("Elapsed ", msPerIter, " ms per iteration (", iter, ")");
131+
if (msPerIter < 1.0f) {
132+
log("Failed: the elapsed time per iteration is less than 1 ms, which may indicate that the relaxedSignal "
133+
"and relaxedWait are not working as expected.");
134+
return 1;
135+
}
136+
log("Succeed!");
137+
return 0;
138+
}

include/mscclpp/core.hpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@
1919

2020
namespace mscclpp {
2121

22-
#define MSCCLPP_UNIQUE_ID_BYTES 128
22+
constexpr unsigned int UniqueIdBytes = 128;
2323

2424
/// Unique ID for initializing the TcpBootstrap.
25-
using UniqueId = std::array<uint8_t, MSCCLPP_UNIQUE_ID_BYTES>;
25+
using UniqueId = std::array<uint8_t, UniqueIdBytes>;
2626

2727
/// Return a version string.
2828
/// @return The MSCCL++ version string in "major.minor.patch" format.
@@ -207,7 +207,6 @@ class TcpBootstrap : public Bootstrap {
207207
enum class Transport {
208208
Unknown, // Unknown transport type.
209209
CudaIpc, // CUDA IPC transport type.
210-
Nvls, // NVLS transport type.
211210
IB0, // InfiniBand device 0 transport type.
212211
IB1, // InfiniBand device 1 transport type.
213212
IB2, // InfiniBand device 2 transport type.
@@ -221,7 +220,7 @@ enum class Transport {
221220
};
222221

223222
namespace detail {
224-
const size_t TransportFlagsSize = 12;
223+
const size_t TransportFlagsSize = 11;
225224
static_assert(TransportFlagsSize == static_cast<size_t>(Transport::NumTransports),
226225
"TransportFlagsSize must match the number of transports");
227226
/// Bitset for storing transport flags.
@@ -441,6 +440,14 @@ class Endpoint {
441440
/// @return The device used.
442441
const Device& device() const;
443442

443+
/// Get the host hash.
444+
/// @return The host hash.
445+
uint64_t hostHash() const;
446+
447+
/// Get the process ID hash.
448+
/// @return The process ID hash.
449+
uint64_t pidHash() const;
450+
444451
/// Get the maximum write queue size.
445452
/// @return The maximum number of write requests that can be queued.
446453
int maxWriteQueueSize() const;
@@ -467,9 +474,9 @@ class Endpoint {
467474
class Connection {
468475
public:
469476
/// Constructor.
477+
/// @param context The context associated with the connection.
470478
/// @param localEndpoint The local endpoint of the connection.
471-
Connection(std::shared_ptr<Context> context, const Endpoint& localEndpoint)
472-
: context_(context), localEndpoint_(localEndpoint), maxWriteQueueSize_(localEndpoint.maxWriteQueueSize()) {}
479+
Connection(std::shared_ptr<Context> context, const Endpoint& localEndpoint);
473480

474481
/// Destructor.
475482
virtual ~Connection() = default;
@@ -506,7 +513,7 @@ class Connection {
506513

507514
/// Get the context associated with this connection.
508515
/// @return A shared pointer to the context associated with this connection.
509-
std::shared_ptr<Context> context() const { return context_; }
516+
std::shared_ptr<Context> context() const;
510517

511518
/// Get the device used by the local endpoint.
512519
/// @return The device used by the local endpoint.

include/mscclpp/gpu.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <hip/hip_runtime.h>
1010

1111
using cudaError_t = hipError_t;
12+
using cudaEvent_t = hipEvent_t;
1213
using cudaGraph_t = hipGraph_t;
1314
using cudaGraphExec_t = hipGraphExec_t;
1415
using cudaDeviceProp = hipDeviceProp_t;
@@ -24,6 +25,7 @@ using CUmemAllocationProp = hipMemAllocationProp;
2425
using CUmemAccessDesc = hipMemAccessDesc;
2526
using CUmemAllocationHandleType = hipMemAllocationHandleType;
2627

28+
constexpr auto cudaErrorPeerAccessAlreadyEnabled = hipErrorPeerAccessAlreadyEnabled;
2729
constexpr auto cudaSuccess = hipSuccess;
2830
constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
2931
constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal;
@@ -45,6 +47,12 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
4547
#define CUDA_SUCCESS hipSuccess
4648
#endif // CUDA_SUCCESS
4749

50+
#define cudaEventCreate(...) hipEventCreate(__VA_ARGS__)
51+
#define cudaEventCreateWithFlags(...) hipEventCreateWithFlags(__VA_ARGS__)
52+
#define cudaEventDestroy(...) hipEventDestroy(__VA_ARGS__)
53+
#define cudaEventRecord(...) hipEventRecord(__VA_ARGS__)
54+
#define cudaEventSynchronize(...) hipEventSynchronize(__VA_ARGS__)
55+
#define cudaEventElapsedTime(...) hipEventElapsedTime(__VA_ARGS__)
4856
#define cudaGetErrorString(...) hipGetErrorString(__VA_ARGS__)
4957
#define cudaGetDevice(...) hipGetDevice(__VA_ARGS__)
5058
#define cudaGetDeviceCount(...) hipGetDeviceCount(__VA_ARGS__)
@@ -53,6 +61,8 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
5361
#define cudaSetDevice(...) hipSetDevice(__VA_ARGS__)
5462
#define cudaDeviceSynchronize(...) hipDeviceSynchronize(__VA_ARGS__)
5563
#define cudaDeviceGetPCIBusId(...) hipDeviceGetPCIBusId(__VA_ARGS__)
64+
#define cudaDeviceCanAccessPeer(...) hipDeviceCanAccessPeer(__VA_ARGS__)
65+
#define cudaDeviceEnablePeerAccess(...) hipDeviceEnablePeerAccess(__VA_ARGS__)
5666
#define cudaHostAlloc(...) hipHostMalloc(__VA_ARGS__)
5767
#define cudaMalloc(...) hipMalloc(__VA_ARGS__)
5868
#define cudaFree(...) hipFree(__VA_ARGS__)

include/mscclpp/port_channel_device.hpp

Lines changed: 39 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -18,35 +18,35 @@ using SemaphoreId = uint32_t;
1818
using MemoryId = uint32_t;
1919

2020
using TriggerType = uint64_t;
21-
const TriggerType TriggerData = 0x1; // Trigger a data transfer.
22-
const TriggerType TriggerFlag = 0x2; // Trigger a signaling.
23-
const TriggerType TriggerSync = 0x4; // Trigger a flush.
21+
constexpr TriggerType TriggerData = 0x1; // Trigger a data transfer.
22+
constexpr TriggerType TriggerFlag = 0x2; // Trigger a signaling.
23+
constexpr TriggerType TriggerSync = 0x4; // Trigger a flush.
2424

25-
#define MSCCLPP_BITS_SIZE 32
26-
#define MSCCLPP_BITS_OFFSET 32
27-
#define MSCCLPP_BITS_MEMORY_ID 9
28-
#define MSCCLPP_BITS_TYPE 3
29-
#define MSCCLPP_BITS_SEMAPHORE_ID 10
30-
#define MSCCLPP_BITS_FIFO_RESERVED 1
25+
constexpr unsigned int TriggerBitsSize = 32;
26+
constexpr unsigned int TriggerBitsOffset = 32;
27+
constexpr unsigned int TriggerBitsMemoryId = 9;
28+
constexpr unsigned int TriggerBitsType = 3;
29+
constexpr unsigned int TriggerBitsSemaphoreId = 10;
30+
constexpr unsigned int TriggerBitsFifoReserved = 1;
3131

3232
/// Basic structure of each work element in the FIFO.
3333
union ChannelTrigger {
3434
ProxyTrigger value;
3535
// The summation of number of bits must be 128 or less.
3636
struct {
3737
// First 64 bits: value[0]
38-
uint64_t size : MSCCLPP_BITS_SIZE;
39-
uint64_t srcOffset : MSCCLPP_BITS_OFFSET;
40-
uint64_t : (64 - MSCCLPP_BITS_SIZE - MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment
38+
uint64_t size : TriggerBitsSize;
39+
uint64_t srcOffset : TriggerBitsOffset;
40+
uint64_t : (64 - TriggerBitsSize - TriggerBitsOffset); // ensure 64-bit alignment
4141
// Second 64 bits: value[1]
42-
uint64_t dstOffset : MSCCLPP_BITS_OFFSET;
43-
uint64_t srcMemoryId : MSCCLPP_BITS_MEMORY_ID;
44-
uint64_t dstMemoryId : MSCCLPP_BITS_MEMORY_ID;
45-
uint64_t type : MSCCLPP_BITS_TYPE;
46-
uint64_t semaphoreId : MSCCLPP_BITS_SEMAPHORE_ID;
47-
uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_MEMORY_ID - MSCCLPP_BITS_MEMORY_ID - MSCCLPP_BITS_TYPE -
48-
MSCCLPP_BITS_SEMAPHORE_ID - MSCCLPP_BITS_FIFO_RESERVED); // ensure 64-bit alignment
49-
uint64_t reserved : MSCCLPP_BITS_FIFO_RESERVED;
42+
uint64_t dstOffset : TriggerBitsOffset;
43+
uint64_t srcMemoryId : TriggerBitsMemoryId;
44+
uint64_t dstMemoryId : TriggerBitsMemoryId;
45+
uint64_t type : TriggerBitsType;
46+
uint64_t semaphoreId : TriggerBitsSemaphoreId;
47+
uint64_t : (64 - TriggerBitsOffset - TriggerBitsMemoryId - TriggerBitsMemoryId - TriggerBitsType -
48+
TriggerBitsSemaphoreId - TriggerBitsFifoReserved); // ensure 64-bit alignment
49+
uint64_t reserved : TriggerBitsFifoReserved;
5050
} fields;
5151

5252
#if defined(MSCCLPP_DEVICE_COMPILE)
@@ -66,28 +66,28 @@ union ChannelTrigger {
6666
/// @param semaphoreId The ID of the semaphore.
6767
MSCCLPP_DEVICE_INLINE ChannelTrigger(TriggerType type, MemoryId dst, uint64_t dstOffset, MemoryId src,
6868
uint64_t srcOffset, uint64_t bytes, int semaphoreId) {
69-
MSCCLPP_ASSERT_DEVICE(type < (1ULL << MSCCLPP_BITS_TYPE), "type is too large");
70-
MSCCLPP_ASSERT_DEVICE(dst < (1ULL << MSCCLPP_BITS_MEMORY_ID), "dst is too large");
71-
MSCCLPP_ASSERT_DEVICE(dstOffset < (1ULL << MSCCLPP_BITS_OFFSET), "dstOffset is too large");
72-
MSCCLPP_ASSERT_DEVICE(src < (1ULL << MSCCLPP_BITS_MEMORY_ID), "src is too large");
73-
MSCCLPP_ASSERT_DEVICE(srcOffset < (1ULL << MSCCLPP_BITS_OFFSET), "srcOffset is too large");
69+
MSCCLPP_ASSERT_DEVICE(type < (1ULL << TriggerBitsType), "type is too large");
70+
MSCCLPP_ASSERT_DEVICE(dst < (1ULL << TriggerBitsMemoryId), "dst is too large");
71+
MSCCLPP_ASSERT_DEVICE(dstOffset < (1ULL << TriggerBitsOffset), "dstOffset is too large");
72+
MSCCLPP_ASSERT_DEVICE(src < (1ULL << TriggerBitsMemoryId), "src is too large");
73+
MSCCLPP_ASSERT_DEVICE(srcOffset < (1ULL << TriggerBitsOffset), "srcOffset is too large");
7474
MSCCLPP_ASSERT_DEVICE(bytes != 0, "bytes must not be zero");
75-
MSCCLPP_ASSERT_DEVICE(bytes < (1ULL << MSCCLPP_BITS_SIZE), "bytes is too large");
76-
MSCCLPP_ASSERT_DEVICE(semaphoreId < (1ULL << MSCCLPP_BITS_SEMAPHORE_ID), "semaphoreId is too large");
77-
constexpr uint64_t maskSize = (1ULL << MSCCLPP_BITS_SIZE) - 1;
78-
constexpr uint64_t maskSrcOffset = (1ULL << MSCCLPP_BITS_OFFSET) - 1;
79-
constexpr uint64_t maskDstOffset = (1ULL << MSCCLPP_BITS_OFFSET) - 1;
80-
constexpr uint64_t maskSrcMemoryId = (1ULL << MSCCLPP_BITS_MEMORY_ID) - 1;
81-
constexpr uint64_t maskDstMemoryId = (1ULL << MSCCLPP_BITS_MEMORY_ID) - 1;
82-
constexpr uint64_t maskType = (1ULL << MSCCLPP_BITS_TYPE) - 1;
83-
constexpr uint64_t maskSemaphoreId = (1ULL << MSCCLPP_BITS_SEMAPHORE_ID) - 1;
84-
value.fst = (((srcOffset & maskSrcOffset) << MSCCLPP_BITS_SIZE) + (bytes & maskSize));
85-
value.snd = (((((((((semaphoreId & maskSemaphoreId) << MSCCLPP_BITS_TYPE) + ((uint64_t)type & maskType))
86-
<< MSCCLPP_BITS_MEMORY_ID) +
75+
MSCCLPP_ASSERT_DEVICE(bytes < (1ULL << TriggerBitsSize), "bytes is too large");
76+
MSCCLPP_ASSERT_DEVICE(semaphoreId < (1ULL << TriggerBitsSemaphoreId), "semaphoreId is too large");
77+
constexpr uint64_t maskSize = (1ULL << TriggerBitsSize) - 1;
78+
constexpr uint64_t maskSrcOffset = (1ULL << TriggerBitsOffset) - 1;
79+
constexpr uint64_t maskDstOffset = (1ULL << TriggerBitsOffset) - 1;
80+
constexpr uint64_t maskSrcMemoryId = (1ULL << TriggerBitsMemoryId) - 1;
81+
constexpr uint64_t maskDstMemoryId = (1ULL << TriggerBitsMemoryId) - 1;
82+
constexpr uint64_t maskType = (1ULL << TriggerBitsType) - 1;
83+
constexpr uint64_t maskSemaphoreId = (1ULL << TriggerBitsSemaphoreId) - 1;
84+
value.fst = (((srcOffset & maskSrcOffset) << TriggerBitsSize) + (bytes & maskSize));
85+
value.snd = (((((((((semaphoreId & maskSemaphoreId) << TriggerBitsType) + ((uint64_t)type & maskType))
86+
<< TriggerBitsMemoryId) +
8787
(dst & maskDstMemoryId))
88-
<< MSCCLPP_BITS_MEMORY_ID) +
88+
<< TriggerBitsMemoryId) +
8989
(src & maskSrcMemoryId))
90-
<< MSCCLPP_BITS_OFFSET) +
90+
<< TriggerBitsOffset) +
9191
(dstOffset & maskDstOffset));
9292
}
9393
#endif // defined(MSCCLPP_DEVICE_COMPILE)

python/mscclpp/core_py.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ void register_core(nb::module_& m) {
7777
nb::enum_<Transport>(m, "Transport")
7878
.value("Unknown", Transport::Unknown)
7979
.value("CudaIpc", Transport::CudaIpc)
80-
.value("Nvls", Transport::Nvls)
8180
.value("IB0", Transport::IB0)
8281
.value("IB1", Transport::IB1)
8382
.value("IB2", Transport::IB2)

0 commit comments

Comments
 (0)