Skip to content

Commit 48d5734

Browse files
authored
TEST/DEVICE: Print kernel launch failure instead of exception (#10923)
Throwing an exception triggers a cleanup flow that also calls cudaFree(), so the error is reported from cudaFree() instead of from the place where it actually happened.
1 parent 6380bca commit 48d5734

File tree

2 files changed

+13
-9
lines changed

2 files changed

+13
-9
lines changed

test/gtest/common/cuda.h

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
#include <sstream>
1313
#include <memory>
1414
#include <vector>
15-
15+
#include <ucs/debug/log.h>
1616

1717
namespace ucx_cuda {
1818

@@ -97,21 +97,21 @@ device_vector<T> make_device_vector(const std::vector<T> &vec)
9797
return device_vector<T>(vec);
9898
}
9999

100-
static inline void synchronize()
100+
static inline ucs_status_t synchronize()
101101
{
102102
cudaError_t err = cudaGetLastError();
103103
if (err != cudaSuccess) {
104-
std::stringstream ss;
105-
ss << "kernel launch failure: " << cudaGetErrorString(err);
106-
throw std::runtime_error(ss.str());
104+
ucs_error("kernel launch failure: %s", cudaGetErrorString(err));
105+
return UCS_ERR_IO_ERROR;
107106
}
108107

109108
err = cudaDeviceSynchronize();
110109
if (err != cudaSuccess) {
111-
std::stringstream ss;
112-
ss << "cudaDeviceSynchronize(): " << cudaGetErrorString(err);
113-
throw std::runtime_error(ss.str());
110+
ucs_error("cudaDeviceSynchronize(): %s", cudaGetErrorString(err));
111+
return UCS_ERR_IO_ERROR;
114112
}
113+
114+
return UCS_OK;
115115
}
116116

117117
} // namespace ucx_cuda

test/gtest/ucp/cuda/test_kernels.cu

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -208,6 +208,10 @@ launch_test_ucp_device_kernel(const test_ucp_device_kernel_params_t &params)
208208
return UCS_ERR_INVALID_PARAM;
209209
}
210210

211-
ucx_cuda::synchronize();
211+
ucs_status_t sync_status = ucx_cuda::synchronize();
212+
if (sync_status != UCS_OK) {
213+
return sync_status;
214+
}
215+
212216
return *status;
213217
}

0 commit comments

Comments
 (0)