@@ -49,15 +49,16 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
     }
 }
 
-UCS_F_DEVICE uint64_t *ucx_perf_cuda_get_sn(const void *address, size_t length)
+static UCS_F_ALWAYS_INLINE uint64_t *
+ucx_perf_cuda_get_sn(const void *address, size_t length)
 {
-    return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length - sizeof(uint64_t));
+    return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length);
 }
 
-UCS_F_DEVICE void ucx_perf_cuda_wait_sn(volatile uint64_t *sn, uint64_t value)
+UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value)
 {
     if (threadIdx.x == 0) {
-        while (*sn < value);
+        while (ucs_device_atomic64_read(sn) < value);
     }
     __syncthreads();
 }
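
Taken together, these two helpers form the device-side flag the benchmark synchronizes on: the sequence number now lives just past the payload (at `address + length`), and the waiter polls it with an explicit atomic read instead of a `volatile` dereference. A minimal consumer sketch (hypothetical, not part of this patch), assuming the peer publishes the iteration number at that location:

```cuda
// Hypothetical consumer kernel, only to illustrate the helpers above.
// Assumes the sender stores the current iteration number at
// UCS_PTR_BYTE_OFFSET(buffer, length), matching ucx_perf_cuda_get_sn().
__global__ void wait_iteration(const void *buffer, size_t length, uint64_t iter)
{
    uint64_t *sn = ucx_perf_cuda_get_sn(buffer, length);
    // Thread 0 spins until the peer publishes `iter`; the __syncthreads()
    // inside ucx_perf_cuda_wait_sn() then releases the whole block.
    ucx_perf_cuda_wait_sn(sn, iter);
}
```
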
@@ -79,8 +80,8 @@ UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
     return count;
 }
 
-UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits,
-                                    size_t from)
+UCS_F_DEVICE size_t
+ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from)
 {
     for (size_t i = from; i < bits; i++) {
         if (!UCX_BIT_GET(set, i)) {
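
`ucx_bitset_ffns` ("find first not-set") returns the index of the first cleared bit at or after `from`, or `bits` when all remaining bits are set, so it composes naturally into a scan loop. A hypothetical usage sketch (not from this patch), assuming `UCX_BIT_GET(set, i)` tests bit `i` of a `uint8_t`-backed bitset:

```cuda
// Hypothetical illustration: visit every cleared (still-outstanding) bit.
// Terminates because ucx_bitset_ffns() returns `bits` when none is found.
UCS_F_DEVICE void visit_unset(const uint8_t *set, size_t bits)
{
    for (size_t i = ucx_bitset_ffns(set, bits, 0); i < bits;
         i = ucx_bitset_ffns(set, bits, i + 1)) {
        // bit i is cleared: request i has not completed yet
    }
}
```
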
@@ -90,6 +91,55 @@ UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits,
     return bits;
 }
 
+#define UCX_KERNEL_CMD(level, cmd, blocks, threads, shared_size, func, ...) \
+    do { \
+        switch (cmd) { \
+        case UCX_PERF_CMD_PUT_SINGLE: \
+            func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
+            break; \
+        case UCX_PERF_CMD_PUT_MULTI: \
+            func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
+            break; \
+        case UCX_PERF_CMD_PUT_PARTIAL: \
+            func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
+            break; \
+        default: \
+            ucs_error("Unsupported cmd: %d", cmd); \
+            break; \
+        } \
+    } while (0)
+
+#define UCX_KERNEL_DISPATCH(perf, func, ...) \
+    do { \
+        ucs_device_level_t _level = perf.params.device_level; \
+        ucx_perf_cmd_t _cmd = perf.params.command; \
+        unsigned _blocks = perf.params.device_block_count; \
+        unsigned _threads = perf.params.device_thread_count; \
+        size_t _shared_size = _threads * perf.params.max_outstanding * \
+                              sizeof(ucp_device_request_t); \
+        switch (_level) { \
+        case UCS_DEVICE_LEVEL_THREAD: \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads, \
+                           _shared_size, func, __VA_ARGS__); \
+            break; \
+        case UCS_DEVICE_LEVEL_WARP: \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads, \
+                           _shared_size, func, __VA_ARGS__); \
+            break; \
+        case UCS_DEVICE_LEVEL_BLOCK: \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads, \
+                           _shared_size, func, __VA_ARGS__); \
+            break; \
+        case UCS_DEVICE_LEVEL_GRID: \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads, \
+                           _shared_size, func, __VA_ARGS__); \
+            break; \
+        default: \
+            ucs_error("Unsupported level: %d", _level); \
+            break; \
+        } \
+    } while (0)
+
 class ucx_perf_cuda_test_runner {
 public:
     ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
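
The two macros exist because a CUDA kernel launch cannot switch on runtime values when the kernel is templated: `UCX_KERNEL_DISPATCH` maps the runtime `device_level` to a compile-time constant, and `UCX_KERNEL_CMD` does the same for the command, so every (level, command) pair gets its own instantiation of `func` (4 levels x 3 commands = 12 variants). A compatible kernel would be shaped like the following sketch (the kernel name and body are hypothetical):

```cuda
// Hypothetical kernel shape the dispatch macros can instantiate:
// templated on both the device level and the benchmark command.
template <ucs_device_level_t level, ucx_perf_cmd_t cmd>
__global__ void ucx_perf_cuda_put_kernel(ucx_perf_cuda_context *ctx)
{
    // per-level, per-command body elided
}

// A runner would then launch all twelve variants through one line, e.g.:
//     UCX_KERNEL_DISPATCH(m_perf, ucx_perf_cuda_put_kernel, m_gpu_ctx);
```
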
@@ -110,17 +160,17 @@ public:
         CUDA_CALL_WARN(cudaFreeHost, m_cpu_ctx);
     }
 
-    ucx_perf_cuda_context &gpu_ctx() const { return *m_gpu_ctx; }
-
-    void wait_for_kernel(size_t msg_length)
+    void wait_for_kernel()
     {
+        size_t msg_length = ucx_perf_get_message_size(&m_perf.params);
         ucx_perf_counter_t last_completed = 0;
         ucx_perf_counter_t completed = m_cpu_ctx->completed_iters;
-        while (1) {
+        unsigned thread_count = m_perf.params.device_thread_count;
+        while (true) {
             ucx_perf_counter_t delta = completed - last_completed;
             if (delta > 0) {
                 // TODO: calculate latency percentile on kernel
-                ucx_perf_update(&m_perf, delta, msg_length);
+                ucx_perf_update(&m_perf, delta, delta * thread_count, msg_length);
             } else if (completed >= m_perf.max_iter) {
                 break;
             }
@@ -133,6 +183,8 @@ public:
 
 protected:
     ucx_perf_context_t &m_perf;
+    ucx_perf_cuda_context *m_cpu_ctx;
+    ucx_perf_cuda_context *m_gpu_ctx;
 
 private:
     void init_ctx()
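
Moving the two context pointers from `private` to `protected` lets concrete runners hand `m_gpu_ctx` straight to their kernels. A hypothetical derived runner (illustrative only; the class and method names are not from this patch), reusing the hypothetical kernel from the earlier sketch:

```cuda
// Hypothetical subclass: launches a kernel and accounts for it on the host.
class ucx_perf_cuda_put_runner : public ucx_perf_cuda_test_runner {
public:
    using ucx_perf_cuda_test_runner::ucx_perf_cuda_test_runner;

    ucs_status_t run_stream_uni()
    {
        // m_gpu_ctx is the device alias of the mapped context, now
        // reachable from subclasses thanks to the protected section
        UCX_KERNEL_DISPATCH(m_perf, ucx_perf_cuda_put_kernel, m_gpu_ctx);
        wait_for_kernel();
        return UCS_OK;
    }
};
```
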
@@ -142,17 +194,16 @@ private:
         CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostGetDevicePointer,
                   &m_gpu_ctx, m_cpu_ctx, 0);
     }
-
-    ucx_perf_cuda_context *m_cpu_ctx;
-    ucx_perf_cuda_context *m_gpu_ctx;
 };
 
 
 template <typename Runner> ucs_status_t
 ucx_perf_cuda_dispatch(ucx_perf_context_t *perf)
 {
     Runner runner(*perf);
-    if (perf->params.command == UCX_PERF_CMD_PUT_MULTI) {
+    if ((perf->params.command == UCX_PERF_CMD_PUT_MULTI) ||
+        (perf->params.command == UCX_PERF_CMD_PUT_SINGLE) ||
+        (perf->params.command == UCX_PERF_CMD_PUT_PARTIAL)) {
         if (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG) {
             return runner.run_pingpong();
         } else if (perf->params.test_type == UCX_PERF_TEST_TYPE_STREAM_UNI) {
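
For reference, the progress channel that `init_ctx()` sets up and `wait_for_kernel()` polls is plain zero-copy mapped host memory: the context is allocated once in page-locked memory, and the kernel writes `completed_iters` through the device alias of the same allocation. A standalone sketch of that pattern (illustrative, using the raw CUDA runtime API rather than the patch's CUDA_CALL wrappers):

```cuda
// Zero-copy progress channel: one host allocation with a device alias.
ucx_perf_cuda_context *cpu_ctx, *gpu_ctx;
cudaHostAlloc((void**)&cpu_ctx, sizeof(*cpu_ctx), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**)&gpu_ctx, cpu_ctx, 0);
// The kernel increments gpu_ctx->completed_iters; wait_for_kernel() reads
// the same memory through cpu_ctx and feeds each delta, scaled by the
// per-iteration thread count, into ucx_perf_update().
```
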