@@ -18,6 +18,7 @@ typedef unsigned long long ucx_perf_cuda_time_t;
 
 struct ucx_perf_cuda_context {
     unsigned                max_outstanding;
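+    /* Flow-control window: outstanding operations covered by one device
+       request (inferred from the dispatch sizing below) */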
+    unsigned                device_fc_window;
    ucx_perf_counter_t      max_iters;
    ucx_perf_cuda_time_t    report_interval_ns;
    ucx_perf_counter_t      completed_iters;
@@ -32,22 +33,46 @@ UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
     return globaltimer;
 }
 
-UCS_F_DEVICE void
-ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
-                            ucx_perf_counter_t completed,
-                            ucx_perf_counter_t max_iters,
-                            ucx_perf_cuda_time_t &last_report_time)
-{
-    if (threadIdx.x == 0) {
-        ucx_perf_cuda_time_t current_time = ucx_perf_cuda_get_time_ns();
-        if (((current_time - last_report_time) >= ctx.report_interval_ns) ||
-            (completed >= max_iters)) {
-            ctx.completed_iters = completed;
-            last_report_time    = current_time;
-            __threadfence();
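+/*
+ * Device-side progress reporter: publishes the completed iteration count to
+ * the host and spaces the updates so that roughly UPDATES_PER_INTERVAL of
+ * them land in each report interval, based on the measured iteration time.
+ */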
+class ucx_perf_cuda_reporter {
+public:
+    /* Number of updates per report interval */
+    static const unsigned UPDATES_PER_INTERVAL = 5;
+
+    __device__
+    ucx_perf_cuda_reporter(ucx_perf_cuda_context &ctx) :
+        m_ctx(ctx),
+        m_max_iters(ctx.max_iters),
+        m_next_report_iter(1),
+        m_last_report_time(ucx_perf_cuda_get_time_ns()),
+        m_report_interval_ns(ctx.report_interval_ns / UPDATES_PER_INTERVAL)
+    {
+    }
+
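+    /* Publish progress to the host; from the time spent per iteration since
+     * the last report, estimate the next iteration at which to report. */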
+    __device__ inline void
+    update_report(ucx_perf_counter_t completed)
+    {
+        if ((threadIdx.x == 0) && ucs_unlikely(completed >= m_next_report_iter)) {
+            assert(completed - m_ctx.completed_iters > 0);
+            ucx_perf_cuda_time_t cur_time  = ucx_perf_cuda_get_time_ns();
+            ucx_perf_cuda_time_t iter_time = (cur_time - m_last_report_time) /
+                                             (completed - m_ctx.completed_iters);
+            assert(iter_time > 0);
+            m_last_report_time    = cur_time;
+            m_ctx.completed_iters = completed;
+            __threadfence_system();
+
+            m_next_report_iter = ucs_min(completed + (m_report_interval_ns / iter_time),
+                                         m_max_iters);
         }
     }
-}
+
+private:
+    ucx_perf_cuda_context &m_ctx;
+    ucx_perf_counter_t     m_max_iters;
+    ucx_perf_counter_t     m_next_report_iter;
+    ucx_perf_cuda_time_t   m_last_report_time;
+    ucx_perf_cuda_time_t   m_report_interval_ns;
+};
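+/*
+ * Usage sketch (hypothetical kernel loop; update_report() itself restricts
+ * the host update to thread 0):
+ *
+ *   ucx_perf_cuda_reporter reporter(*ctx);
+ *   for (ucx_perf_counter_t i = 1; i <= ctx->max_iters; i++) {
+ *       // ... issue one iteration and wait for its completion ...
+ *       reporter.update_report(i);
+ *   }
+ */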
 
 static UCS_F_ALWAYS_INLINE uint64_t *
 ucx_perf_cuda_get_sn(const void *address, size_t length)
@@ -63,95 +88,83 @@ UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value)
     __syncthreads();
 }
 
-/* Simple bitset */
-#define UCX_BIT_MASK(bit)       (1 << ((bit) & (CHAR_BIT - 1)))
-#define UCX_BIT_SET(set, bit)   (set[(bit)/CHAR_BIT] |= UCX_BIT_MASK(bit))
-#define UCX_BIT_RESET(set, bit) (set[(bit)/CHAR_BIT] &= ~UCX_BIT_MASK(bit))
-#define UCX_BIT_GET(set, bit)   (set[(bit)/CHAR_BIT] & UCX_BIT_MASK(bit))
-#define UCX_BITSET_SIZE(bits)   ((bits + CHAR_BIT - 1) / CHAR_BIT)
-
-UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
-    size_t count = 0;
-    for (size_t i = 0; i < bits; i++) {
-        if (UCX_BIT_GET(set, i)) {
-            count++;
-        }
-    }
-    return count;
-}
-
-UCS_F_DEVICE size_t
-ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from)
+template<ucs_device_level_t level>
+__host__ UCS_F_DEVICE unsigned ucx_perf_cuda_thread_index(size_t tid)
 {
-    for (size_t i = from; i < bits; i++) {
-        if (!UCX_BIT_GET(set, i)) {
-            return i;
-        }
+    switch (level) {
+    case UCS_DEVICE_LEVEL_THREAD: return tid;
+    case UCS_DEVICE_LEVEL_WARP:   return tid / UCS_DEVICE_NUM_THREADS_IN_WARP;
+    default:                      return 0;
     }
-    return bits;
 }
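+/* Example: with 32-thread warps, UCS_DEVICE_LEVEL_WARP maps threads 0..31 to
+ * sender index 0 and threads 32..63 to index 1; BLOCK and GRID map every
+ * thread to index 0. */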
 
-#define UCX_KERNEL_CMD(level, cmd, blocks, threads, shared_size, func, ...) \
-    do { \
-        switch (cmd) { \
-        case UCX_PERF_CMD_PUT_SINGLE: \
-            func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
-            break; \
-        case UCX_PERF_CMD_PUT_MULTI: \
-            func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
-            break; \
-        case UCX_PERF_CMD_PUT_PARTIAL: \
-            func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
-            break; \
-        default: \
-            ucs_error("Unsupported cmd: %d", cmd); \
-            break; \
-        } \
-    } while (0)
+#define UCX_PERF_THREAD_INDEX_SET(_level, _tid, _outval) \
+    (_outval) = ucx_perf_cuda_thread_index<_level>(_tid)
+
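+/* Expand _func once per supported command, turning the runtime command enum
+ * into a compile-time template argument. */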
+#define UCX_PERF_SWITCH_CMD(_cmd, _func, ...) \
+    switch (_cmd) { \
+    case UCX_PERF_CMD_PUT_SINGLE: \
+        _func(UCX_PERF_CMD_PUT_SINGLE, __VA_ARGS__); \
+        break; \
+    case UCX_PERF_CMD_PUT_MULTI: \
+        _func(UCX_PERF_CMD_PUT_MULTI, __VA_ARGS__); \
+        break; \
+    case UCX_PERF_CMD_PUT_PARTIAL: \
+        _func(UCX_PERF_CMD_PUT_PARTIAL, __VA_ARGS__); \
+        break; \
+    default: \
+        ucs_error("Unsupported cmd: %d", _cmd); \
+        break; \
+    }
 
-#define UCX_KERNEL_DISPATCH(perf, func, ...) \
-    do { \
-        ucs_device_level_t _level = perf.params.device_level; \
-        ucx_perf_cmd_t _cmd       = perf.params.command; \
-        unsigned _blocks          = perf.params.device_block_count; \
-        unsigned _threads         = perf.params.device_thread_count; \
-        size_t _shared_size       = _threads * perf.params.max_outstanding * \
-                                    sizeof(ucp_device_request_t); \
-        switch (_level) { \
+#define UCX_PERF_SWITCH_LEVEL(_level, _func, ...) \
+    switch (_level) { \
     case UCS_DEVICE_LEVEL_THREAD: \
-        UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads, \
-                       _shared_size, func, __VA_ARGS__); \
+        _func(UCS_DEVICE_LEVEL_THREAD, __VA_ARGS__); \
         break; \
     case UCS_DEVICE_LEVEL_WARP: \
-        UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads, \
-                       _shared_size, func, __VA_ARGS__); \
+        _func(UCS_DEVICE_LEVEL_WARP, __VA_ARGS__); \
        break; \
     case UCS_DEVICE_LEVEL_BLOCK: \
-        UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads, \
-                       _shared_size, func, __VA_ARGS__); \
-        break; \
     case UCS_DEVICE_LEVEL_GRID: \
-        UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads, \
-                       _shared_size, func, __VA_ARGS__); \
-        break; \
     default: \
         ucs_error("Unsupported level: %d", _level); \
         break; \
-    } \
+    }
+
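+/* Launch one kernel instantiation for a (command, level) pair; shared memory
+ * holds _reqs_count device requests per sender index at this level. */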
+#define UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL(_cmd, _level, _perf, _kernel, ...) \
+    do { \
+        unsigned _blocks     = _perf.params.device_block_count; \
+        unsigned _threads    = _perf.params.device_thread_count; \
+        unsigned _reqs_count = ucs_div_round_up(_perf.params.max_outstanding, \
+                                                _perf.params.device_fc_window); \
+        size_t _shared_size  = _reqs_count * sizeof(ucp_device_request_t) * \
+                               ucx_perf_cuda_thread_index<_level>(_threads); \
+        _kernel<_level, _cmd><<<_blocks, _threads, _shared_size>>>(__VA_ARGS__); \
     } while (0)
 
+#define UCX_PERF_KERNEL_DISPATCH_CMD(_level, _perf, _kernel, ...) \
+    UCX_PERF_SWITCH_CMD(_perf.params.command, UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL, \
+                        _level, _perf, _kernel, __VA_ARGS__);
+
+#define UCX_PERF_KERNEL_DISPATCH(_perf, _kernel, ...) \
+    UCX_PERF_SWITCH_LEVEL(_perf.params.device_level, UCX_PERF_KERNEL_DISPATCH_CMD, \
+                          _perf, _kernel, __VA_ARGS__);
154+
 class ucx_perf_cuda_test_runner {
 public:
     ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
     {
         init_ctx();
 
         m_cpu_ctx->max_outstanding    = perf.params.max_outstanding;
+        m_cpu_ctx->device_fc_window   = perf.params.device_fc_window;
         m_cpu_ctx->max_iters          = perf.max_iter;
         m_cpu_ctx->completed_iters    = 0;
         m_cpu_ctx->report_interval_ns = (perf.report_interval == ULONG_MAX) ?
                                         ULONG_MAX :
-                                        ucs_time_to_nsec(perf.report_interval) / 100;
+                                        ucs_time_to_nsec(perf.report_interval);
         m_cpu_ctx->status = UCS_ERR_NOT_IMPLEMENTED;
     }
 
@@ -166,11 +179,16 @@ public:
         ucx_perf_counter_t last_completed = 0;
         ucx_perf_counter_t completed      = m_cpu_ctx->completed_iters;
         unsigned thread_count             = m_perf.params.device_thread_count;
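+        /* Each completed iteration counts one message per device-level
+         * sender (thread, warp, or a single block/grid sender). */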
+        ucs_device_level_t level          = m_perf.params.device_level;
+        unsigned msgs_per_iter;
+        UCX_PERF_SWITCH_LEVEL(level, UCX_PERF_THREAD_INDEX_SET, thread_count,
+                              msgs_per_iter);
+
         while (true) {
             ucx_perf_counter_t delta = completed - last_completed;
             if (delta > 0) {
                 // TODO: calculate latency percentile on kernel
-                ucx_perf_update(&m_perf, delta, delta * thread_count, msg_length);
+                ucx_perf_update(&m_perf, delta, delta * msgs_per_iter, msg_length);
             } else if (completed >= m_perf.max_iter) {
                 break;
             }