@@ -88,16 +88,11 @@ __host__ UCS_F_DEVICE unsigned ucx_perf_cuda_thread_index(size_t tid)
8888{
8989 switch (level) {
9090 case UCS_DEVICE_LEVEL_THREAD: return tid;
91- /* TODO: use UCS_DEVICE_NUM_THREADS_IN_WARP */
92- case UCS_DEVICE_LEVEL_WARP: return tid / 32 ;
91+ case UCS_DEVICE_LEVEL_WARP: return tid / UCS_DEVICE_NUM_THREADS_IN_WARP;
9392 default : return 0 ;
9493 }
9594}
9695
/* Ceiling division helper: returns the truncated quotient x / y, plus one
 * when x % y is non-zero. There is no guard against y == 0 in this body.
 * The '-' markers show this hunk deleting the helper; the request-count
 * computation later in the patch calls ucs_div_round_up instead. */
97- __host__ UCS_F_DEVICE unsigned ucx_ceil_div (unsigned x, unsigned y) {
98- return (x / y) + ((x % y) != 0 );
99- }
100-
/* Assigns to _outval the value of ucx_perf_cuda_thread_index<_level>(_tid).
 * Per the switch in that function, the result is _tid for
 * UCS_DEVICE_LEVEL_THREAD, _tid / UCS_DEVICE_NUM_THREADS_IN_WARP for
 * UCS_DEVICE_LEVEL_WARP, and 0 for any other level.
 * NOTE(review): assumes _level is the template argument that the function
 * switches on -- the template header is outside this hunk; confirm. */
10196#define UCX_PERF_THREAD_INDEX_SET (_level, _tid, _outval ) \
10297 (_outval) = ucx_perf_cuda_thread_index<_level>(_tid)
10398
@@ -136,8 +131,8 @@ __host__ UCS_F_DEVICE unsigned ucx_ceil_div(unsigned x, unsigned y) {
136131 do { \
137132 unsigned _blocks = _perf.params .device_block_count ; \
138133 unsigned _threads = _perf.params .device_thread_count ; \
139- unsigned _reqs_count = ucx_ceil_div (_perf.params .max_outstanding , \
140- _perf.params .device_fc_window ); \
134+ unsigned _reqs_count = ucs_div_round_up (_perf.params .max_outstanding , \
135+ _perf.params .device_fc_window ); \
141136 size_t _shared_size = _reqs_count * sizeof (ucp_device_request_t ) * \
142137 ucx_perf_cuda_thread_index<_level>(_threads); \
143138 _kernel<_level, _cmd><<<_blocks, _threads, _shared_size>>> (__VA_ARGS__); \
0 commit comments