Skip to content

Commit bf7dedc

Browse files
authored
Merge pull request #15545 from sneaxiy/fix_debug_nccl_error
Fix nccl unittest error in debug mode
2 parents: a6910f9 + ba4f43f; commit: bf7dedc

File tree

4 files changed

+45
-98
lines changed

4 files changed

+45
-98
lines changed

paddle/fluid/operators/distributed/proto_encoder_helper.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -85,7 +85,7 @@ class ProtoEncodeHelper {
8585
#define REPLACE_ENFORCE_GLOG 1
8686
// Make sure callers didn't do operations that went over max_size promised
8787
if (paddle::platform::is_error(p_ <= limit_)) {
88-
paddle::platform::throw_on_error(p_ <= limit_);
88+
paddle::platform::throw_on_error(p_ <= limit_, "");
8989
}
9090
#undef REPLACE_ENFORCE_GLOG
9191
}

paddle/fluid/platform/enforce.h

Lines changed: 41 additions & 96 deletions
Original file line number | Diff line number | Diff line change
@@ -71,9 +71,8 @@ struct EnforceNotMet : public std::exception {
7171
}
7272
}
7373

74-
template <typename... ARGS>
75-
EnforceNotMet(const char* f, int l, ARGS... args) {
76-
Init(string::Sprintf(args...), f, l);
74+
EnforceNotMet(const std::string& str, const char* f, int l) {
75+
Init(str, f, l);
7776
}
7877

7978
const char* what() const noexcept override { return err_str_.c_str(); }
@@ -142,68 +141,56 @@ struct EOFException : public std::exception {
142141

143142
inline bool is_error(bool stat) { return !stat; }
144143

145-
template <typename... Args>
146-
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
147-
bool stat, const Args&... args) {
144+
inline void throw_on_error(bool stat, const std::string& msg) {
148145
#ifndef REPLACE_ENFORCE_GLOG
149-
throw std::runtime_error(string::Sprintf(args...));
146+
throw std::runtime_error(msg);
150147
#else
151-
LOG(FATAL) << string::Sprintf(args...);
148+
LOG(FATAL) << msg;
152149
#endif
153150
}
154151

155152
#ifdef PADDLE_WITH_CUDA
156153

157-
inline bool is_error(cudaError_t e) { return UNLIKELY(e); }
154+
inline bool is_error(cudaError_t e) { return e != cudaSuccess; }
158155

159-
template <typename... Args>
160-
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
161-
cudaError_t e, const Args&... args) {
156+
inline void throw_on_error(cudaError_t e, const std::string& msg) {
162157
#ifndef REPLACE_ENFORCE_GLOG
163-
throw thrust::system_error(e, thrust::cuda_category(),
164-
string::Sprintf(args...));
158+
throw thrust::system_error(e, thrust::cuda_category(), msg);
165159
#else
166-
LOG(FATAL) << string::Sprintf(args...);
160+
LOG(FATAL) << msg;
167161
#endif
168162
}
169163

170164
inline bool is_error(curandStatus_t stat) {
171165
return stat != CURAND_STATUS_SUCCESS;
172166
}
173167

174-
template <typename... Args>
175-
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
176-
curandStatus_t stat, const Args&... args) {
168+
inline void throw_on_error(curandStatus_t stat, const std::string& msg) {
177169
#ifndef REPLACE_ENFORCE_GLOG
178170
throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
179-
string::Sprintf(args...));
171+
msg);
180172
#else
181-
LOG(FATAL) << string::Sprintf(args...);
173+
LOG(FATAL) << msg;
182174
#endif
183175
}
184176

185177
inline bool is_error(cudnnStatus_t stat) {
186178
return stat != CUDNN_STATUS_SUCCESS;
187179
}
188180

189-
template <typename... Args>
190-
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
191-
cudnnStatus_t stat, const Args&... args) {
181+
inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) {
192182
#ifndef REPLACE_ENFORCE_GLOG
193-
throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) +
194-
string::Sprintf(args...));
183+
throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + msg);
195184
#else
196-
LOG(FATAL) << string::Sprintf(args...);
185+
LOG(FATAL) << platform::dynload::cudnnGetErrorString(stat) << msg;
197186
#endif
198187
}
199188

200189
inline bool is_error(cublasStatus_t stat) {
201190
return stat != CUBLAS_STATUS_SUCCESS;
202191
}
203192

204-
template <typename... Args>
205-
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
206-
cublasStatus_t stat, const Args&... args) {
193+
inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
207194
std::string err;
208195
if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
209196
err = "CUBLAS: not initialized, ";
@@ -225,87 +212,45 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
225212
err = "CUBLAS: license error, ";
226213
}
227214
#ifndef REPLACE_ENFORCE_GLOG
228-
throw std::runtime_error(err + string::Sprintf(args...));
215+
throw std::runtime_error(err + msg);
229216
#else
230-
LOG(FATAL) << err << string::Sprintf(args...);
217+
LOG(FATAL) << err << msg;
231218
#endif
232219
}
233220

234221
#if !defined(__APPLE__) && !defined(_WIN32)
235-
template <typename... Args>
236-
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
237-
ncclResult_t stat, const Args&... args) {
238-
if (stat == ncclSuccess) {
239-
return;
240-
} else {
222+
inline bool is_error(ncclResult_t nccl_result) {
223+
return nccl_result != ncclSuccess;
224+
}
225+
226+
inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
241227
#ifndef REPLACE_ENFORCE_GLOG
242-
throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) +
243-
string::Sprintf(args...));
228+
throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + msg);
244229
#else
245-
LOG(FATAL) << platform::dynload::ncclGetErrorString(stat)
246-
<< string::Sprintf(args...);
230+
LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) << msg;
247231
#endif
248-
}
249232
}
250233
#endif // __APPLE__ and windows
251234
#endif // PADDLE_WITH_CUDA
252235

253-
template <typename T>
254-
inline void throw_on_error(T e) {
255-
throw_on_error(e, "");
256-
}
257-
258-
#define PADDLE_THROW(...) \
259-
throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__)
260-
261-
#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_;
262-
263-
#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \
264-
::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG));
265-
266-
#ifdef _WIN32
267-
#define __PADDLE_THROW_ON_ERROR(COND, ...) \
268-
__THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)
269-
#else // _WIN32
270-
#define __PADDLE_THROW_ON_ERROR(COND, ...) \
271-
__PADDLE_THROW_ERROR_I( \
272-
__VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
273-
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
274-
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
275-
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
276-
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
277-
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
278-
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
279-
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
280-
__THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__))
281-
#endif // _WIN32
282-
283-
#define __PADDLE_UNARY_COMPARE(COND, ...) \
284-
do { \
285-
auto __cond = COND; \
286-
if (UNLIKELY(::paddle::platform::is_error(__cond))) { \
287-
__PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \
288-
} \
236+
#define PADDLE_THROW(...) \
237+
throw ::paddle::platform::EnforceNotMet( \
238+
::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__)
239+
240+
#define PADDLE_ENFORCE(COND, ...) \
241+
do { \
242+
auto __cond__ = (COND); \
243+
if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \
244+
try { \
245+
::paddle::platform::throw_on_error( \
246+
__cond__, ::paddle::string::Sprintf(__VA_ARGS__)); \
247+
} catch (...) { \
248+
throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
249+
__FILE__, __LINE__); \
250+
} \
251+
} \
289252
} while (0)
290253

291-
#ifndef REPLACE_ENFORCE_GLOG
292-
#define __PADDLE_ENFORCE_I(COND, ...) \
293-
do { \
294-
try { \
295-
__PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \
296-
} catch (...) { \
297-
throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
298-
__FILE__, __LINE__); \
299-
} \
300-
} while (0)
301-
302-
#else
303-
#define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__);
304-
#endif // REPLACE_ENFORCE_GLOG
305-
306-
#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args
307-
#define PADDLE_ENFORCE(...) __PADDLE_ENFORCE((__VA_ARGS__))
308-
309254
#define PADDLE_THROW_EOF() \
310255
do { \
311256
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \

paddle/fluid/platform/nccl_helper.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,7 @@ class NCCLGroupGuard {
6464
}
6565

6666
inline ~NCCLGroupGuard() {
67-
CHECK_EQ(dynload::ncclGroupEnd(), ncclSuccess);
67+
PADDLE_ENFORCE(dynload::ncclGroupEnd());
6868
NCCLMutex().unlock();
6969
}
7070
};

paddle/fluid/string/printf.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,8 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
8484
tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
8585
}
8686

87+
inline std::string Sprintf() { return ""; }
88+
8789
template <typename... Args>
8890
std::string Sprintf(const Args&... args) {
8991
std::ostringstream oss;

0 commit comments

Comments
 (0)