@@ -109,16 +109,6 @@ private:
109109 ucp_device_request_t *m_ptr;
110110};
111111
112- template <typename Func> /* Scope guard: invokes the wrapped callable when the guard object is destroyed. */
113- class scope_guard {
114- public:
115- __device__ scope_guard (Func& func) : m_func(func) {} /* binds to func by reference; no copy is taken */
116- __device__ ~scope_guard () { m_func (); } /* calls the stored callable on destruction */
117-
118- private:
119- Func& m_func; /* reference member, so the callable must outlive the guard */
120- };
121-
122112UCS_F_DEVICE ucs_status_t
123113ucp_test_kernel_get_state (const test_ucp_device_kernel_params_t &params,
124114 test_ucp_device_kernel_result_t &result)
@@ -151,12 +141,10 @@ ucp_test_kernel_get_state(const test_ucp_device_kernel_params_t &params,
151141}
152142
153143template <ucs_device_level_t level>
154- static __global__ void
155- ucp_test_kernel (const test_ucp_device_kernel_params_t params,
156- test_ucp_device_kernel_result_t *result_ptr)
144+ UCS_F_DEVICE void
145+ ucp_test_kernel_job (const test_ucp_device_kernel_params_t & params,
146+ test_ucp_device_kernel_result_t *result_ptr)
157147{
158- /* Execute fence on any return, to ensure result is visible to the host */
159- scope_guard fence (__threadfence_system );
160148 ucs_status_t &status = result_ptr->status ;
161149
162150 if (blockDim .x > device_request<level>::MAX_THREADS) {
@@ -197,6 +185,16 @@ ucp_test_kernel(const test_ucp_device_kernel_params_t params,
197185 status = ucp_test_kernel_get_state (params, *result_ptr);
198186}
199187
188+ template <ucs_device_level_t level> /* Kernel entry point: runs ucp_test_kernel_job<level>, then issues a system-scope memory fence. */
189+ static __global__ void
190+ ucp_test_kernel (const test_ucp_device_kernel_params_t params,
191+ test_ucp_device_kernel_result_t *result_ptr)
192+ {
193+ ucp_test_kernel_job<level>(params, result_ptr);
194+ /* The fence follows the job call, so it executes on every return path of ucp_test_kernel_job; intended to ensure the result is visible to the host */
195+ __threadfence_system ();
196+ }
197+
200198static ucs_status_t check_warp_size ()
201199{
202200 CUdevice cuda_device;
0 commit comments