@@ -18,6 +18,7 @@ typedef unsigned long long ucx_perf_cuda_time_t;
1818
1919struct ucx_perf_cuda_context {
2020 unsigned max_outstanding;
21+ unsigned device_fc_window;
2122 ucx_perf_counter_t max_iters;
2223 ucx_perf_cuda_time_t report_interval_ns;
2324 ucx_perf_counter_t completed_iters;
@@ -32,22 +33,41 @@ UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
3233 return globaltimer;
3334}
3435
35- UCS_F_DEVICE void
36- ucx_perf_cuda_update_report (ucx_perf_cuda_context &ctx,
37- ucx_perf_counter_t completed,
38- ucx_perf_counter_t max_iters,
39- ucx_perf_cuda_time_t &last_report_time)
40- {
41- if (threadIdx .x == 0 ) {
42- ucx_perf_cuda_time_t current_time = ucx_perf_cuda_get_time_ns ();
43- if (((current_time - last_report_time) >= ctx.report_interval_ns ) ||
44- (completed >= max_iters)) {
45- ctx.completed_iters = completed;
46- last_report_time = current_time;
47- __threadfence ();
36+ class ucx_perf_cuda_reporter {
37+ public:
38+ __device__
39+ ucx_perf_cuda_reporter (ucx_perf_cuda_context &ctx) :
40+ m_ctx (ctx),
41+ m_max_iters (ctx.max_iters),
42+ m_next_report_iter (1 ),
43+ m_last_report_time (ucx_perf_cuda_get_time_ns()),
44+ m_report_interval_ns (ctx.report_interval_ns / 5 )
45+ {
46+ }
47+
48+ __device__ inline void
49+ update_report (ucx_perf_counter_t completed)
50+ {
51+ if ((threadIdx .x == 0 ) && ucs_unlikely (completed >= m_next_report_iter)) {
52+ ucx_perf_cuda_time_t cur_time = ucx_perf_cuda_get_time_ns ();
53+ ucx_perf_cuda_time_t iter_time = (cur_time - m_last_report_time) /
54+ (completed - m_ctx.completed_iters );
55+ m_last_report_time = cur_time;
56+ m_ctx.completed_iters = completed;
57+ __threadfence_system ();
58+
59+ m_next_report_iter = ucs_min (completed + (m_report_interval_ns / iter_time),
60+ m_max_iters);
4861 }
4962 }
50- }
63+
64+ private:
65+ ucx_perf_cuda_context &m_ctx;
66+ ucx_perf_counter_t m_max_iters;
67+ ucx_perf_counter_t m_next_report_iter;
68+ ucx_perf_cuda_time_t m_last_report_time;
69+ ucx_perf_cuda_time_t m_report_interval_ns;
70+ };
5171
5272static UCS_F_ALWAYS_INLINE uint64_t *
5373ucx_perf_cuda_get_sn (const void *address, size_t length)
@@ -63,95 +83,83 @@ UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value)
6383 __syncthreads ();
6484}
6585
66- /* Simple bitset */
67- #define UCX_BIT_MASK (bit ) (1 << ((bit) & (CHAR_BIT - 1 )))
68- #define UCX_BIT_SET (set, bit ) (set[(bit)/CHAR_BIT] |= UCX_BIT_MASK(bit))
69- #define UCX_BIT_RESET (set, bit ) (set[(bit)/CHAR_BIT] &= ~UCX_BIT_MASK (bit))
70- #define UCX_BIT_GET (set, bit ) (set[(bit)/CHAR_BIT] & UCX_BIT_MASK (bit))
71- #define UCX_BITSET_SIZE (bits ) ((bits + CHAR_BIT - 1 ) / CHAR_BIT)
72-
73- UCS_F_DEVICE size_t ucx_bitset_popcount (const uint8_t *set, size_t bits) {
74- size_t count = 0 ;
75- for (size_t i = 0 ; i < bits; i++) {
76- if (UCX_BIT_GET (set, i)) {
77- count++;
78- }
79- }
80- return count;
81- }
82-
83- UCS_F_DEVICE size_t
84- ucx_bitset_ffns (const uint8_t *set, size_t bits, size_t from)
86+ template <ucs_device_level_t level>
87+ __host__ UCS_F_DEVICE unsigned ucx_perf_cuda_thread_index (size_t tid)
8588{
86- for ( size_t i = from; i < bits; i++ ) {
87- if (! UCX_BIT_GET (set, i)) {
88- return i ;
89- }
89+ switch (level ) {
90+ case UCS_DEVICE_LEVEL_THREAD: return tid;
91+ case UCS_DEVICE_LEVEL_WARP: return tid / UCS_DEVICE_NUM_THREADS_IN_WARP ;
92+ default : return 0 ;
9093 }
91- return bits;
9294}
9395
94- #define UCX_KERNEL_CMD (level, cmd, blocks, threads, shared_size, func, ...) \
95- do { \
96- switch (cmd) { \
97- case UCX_PERF_CMD_PUT_SINGLE: \
98- func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads, shared_size>>> (__VA_ARGS__); \
99- break ; \
100- case UCX_PERF_CMD_PUT_MULTI: \
101- func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads, shared_size>>> (__VA_ARGS__); \
102- break ; \
103- case UCX_PERF_CMD_PUT_PARTIAL: \
104- func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads, shared_size>>> (__VA_ARGS__); \
105- break ; \
106- default : \
107- ucs_error (" Unsupported cmd: %d" , cmd); \
108- break ; \
109- } \
110- } while (0 )
96+ #define UCX_PERF_THREAD_INDEX_SET (_level, _tid, _outval ) \
97+ (_outval) = ucx_perf_cuda_thread_index<_level>(_tid)
98+
99+ #define UCX_PERF_SWITCH_CMD (_cmd, _func, ...) \
100+ switch (_cmd) { \
101+ case UCX_PERF_CMD_PUT_SINGLE: \
102+ _func (UCX_PERF_CMD_PUT_SINGLE, __VA_ARGS__); \
103+ break ; \
104+ case UCX_PERF_CMD_PUT_MULTI: \
105+ _func (UCX_PERF_CMD_PUT_MULTI, __VA_ARGS__); \
106+ break ; \
107+ case UCX_PERF_CMD_PUT_PARTIAL: \
108+ _func (UCX_PERF_CMD_PUT_PARTIAL, __VA_ARGS__); \
109+ break ; \
110+ default : \
111+ ucs_error (" Unsupported cmd: %d" , _cmd); \
112+ break ; \
113+ }
111114
112- #define UCX_KERNEL_DISPATCH (perf, func, ...) \
113- do { \
114- ucs_device_level_t _level = perf.params .device_level ; \
115- ucx_perf_cmd_t _cmd = perf.params .command ; \
116- unsigned _blocks = perf.params .device_block_count ; \
117- unsigned _threads = perf.params .device_thread_count ; \
118- size_t _shared_size = _threads * perf.params .max_outstanding * \
119- sizeof (ucp_device_request_t ); \
120- switch (_level) { \
115+ #define UCX_PERF_SWITCH_LEVEL (_level, _func, ...) \
116+ switch (_level) { \
121117 case UCS_DEVICE_LEVEL_THREAD: \
122- UCX_KERNEL_CMD (UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads,\
123- _shared_size, func, __VA_ARGS__); \
118+ _func (UCS_DEVICE_LEVEL_THREAD, __VA_ARGS__); \
124119 break ; \
125120 case UCS_DEVICE_LEVEL_WARP: \
126- UCX_KERNEL_CMD (UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads,\
127- _shared_size, func, __VA_ARGS__); \
121+ _func (UCS_DEVICE_LEVEL_WARP, __VA_ARGS__); \
128122 break ; \
129123 case UCS_DEVICE_LEVEL_BLOCK: \
130- UCX_KERNEL_CMD (UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads,\
131- _shared_size, func, __VA_ARGS__); \
132- break ; \
133124 case UCS_DEVICE_LEVEL_GRID: \
134- UCX_KERNEL_CMD (UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads,\
135- _shared_size, func, __VA_ARGS__); \
136- break ; \
137125 default : \
138126 ucs_error (" Unsupported level: %d" , _level); \
139127 break ; \
140- } \
128+ }
129+
130+ #define UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL (_cmd, _level, _perf, _kernel, ...) \
131+ do { \
132+ unsigned _blocks = _perf.params .device_block_count ; \
133+ unsigned _threads = _perf.params .device_thread_count ; \
134+ unsigned _reqs_count = ucs_div_round_up (_perf.params .max_outstanding , \
135+ _perf.params .device_fc_window ); \
136+ size_t _shared_size = _reqs_count * sizeof (ucp_device_request_t ) * \
137+ ucx_perf_cuda_thread_index<_level>(_threads); \
138+ _kernel<_level, _cmd><<<_blocks, _threads, _shared_size>>> (__VA_ARGS__); \
141139 } while (0 )
142140
141+ #define UCX_PERF_KERNEL_DISPATCH_CMD (_level, _perf, _kernel, ...) \
142+ UCX_PERF_SWITCH_CMD (_perf.params.command, UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL, \
143+ _level, _perf, _kernel, __VA_ARGS__);
144+
145+ #define UCX_PERF_KERNEL_DISPATCH (_perf, _kernel, ...) \
146+ UCX_PERF_SWITCH_LEVEL (_perf.params.device_level, UCX_PERF_KERNEL_DISPATCH_CMD, \
147+ _perf, _kernel, __VA_ARGS__);
148+
149+
143150class ucx_perf_cuda_test_runner {
144151public:
145152 ucx_perf_cuda_test_runner (ucx_perf_context_t &perf) : m_perf(perf)
146153 {
147154 init_ctx ();
148155
149156 m_cpu_ctx->max_outstanding = perf.params .max_outstanding ;
157+ m_cpu_ctx->device_fc_window = perf.params .device_fc_window ;
150158 m_cpu_ctx->max_iters = perf.max_iter ;
151159 m_cpu_ctx->completed_iters = 0 ;
152160 m_cpu_ctx->report_interval_ns = (perf.report_interval == ULONG_MAX) ?
153161 ULONG_MAX :
154- ucs_time_to_nsec (perf.report_interval ) / 100 ;
162+ ucs_time_to_nsec (perf.report_interval );
155163 m_cpu_ctx->status = UCS_ERR_NOT_IMPLEMENTED;
156164 }
157165
@@ -166,11 +174,16 @@ public:
166174 ucx_perf_counter_t last_completed = 0 ;
167175 ucx_perf_counter_t completed = m_cpu_ctx->completed_iters ;
168176 unsigned thread_count = m_perf.params .device_thread_count ;
177+ ucs_device_level_t level = m_perf.params .device_level ;
178+ unsigned msgs_per_iter;
179+ UCX_PERF_SWITCH_LEVEL (level, UCX_PERF_THREAD_INDEX_SET, thread_count,
180+ msgs_per_iter);
181+
169182 while (true ) {
170183 ucx_perf_counter_t delta = completed - last_completed;
171184 if (delta > 0 ) {
172185 // TODO: calculate latency percentile on kernel
173- ucx_perf_update (&m_perf, delta, delta * thread_count , msg_length);
186+ ucx_perf_update (&m_perf, delta, delta * msgs_per_iter , msg_length);
174187 } else if (completed >= m_perf.max_iter ) {
175188 break ;
176189 }
0 commit comments