Commit 5582719

Optimize error message, include dgc, nccl, size op (#24456), test=release/1.8 (#24524)

1 parent f0c6101 commit 5582719

7 files changed, +145 -127 lines changed

paddle/fluid/operators/dgc_clip_by_norm_op.cc

Lines changed: 2 additions & 2 deletions

@@ -23,8 +23,8 @@ class DGCClipByNormOp : public ClipByNormOp {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("current_step"),
-                   "current_step should be set.");
+    OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step",
+                   "DGCClipByNormOp");
 
     return ClipByNormOp::InferShape(ctx);
   }
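This two-line change is the pattern the whole commit follows: a hand-written PADDLE_ENFORCE message becomes an OP_INOUT_CHECK that names the role ("Input" or "Output"), the variable, and the operator, so missing-tensor errors are phrased uniformly. As a rough standalone sketch of that shape (this is not Paddle's actual macro definition, and the ctx in the usage comment is only a stand-in):

// Standalone illustration only -- not Paddle's OP_INOUT_CHECK definition.
// One macro turns a boolean presence check into a uniform exception that
// names the role, the variable, and the operator type.
#include <stdexcept>
#include <string>

#define DEMO_OP_INOUT_CHECK(cond, role, name, op_type)             \
  do {                                                             \
    if (!(cond)) {                                                 \
      throw std::runtime_error(std::string(role) + "(" + (name) +  \
                               ") of operator " + (op_type) +      \
                               " is not found.");                  \
    }                                                              \
  } while (0)

// Usage mirroring the check above:
//   DEMO_OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input",
//                       "current_step", "DGCClipByNormOp");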

paddle/fluid/operators/dgc_op.cc

Lines changed: 15 additions & 22 deletions

@@ -25,28 +25,21 @@ class DGCOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of DGCop should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Param"), true,
-        platform::errors::NotFound("Input(Param) of DGCop is not found."));
-    PADDLE_ENFORCE(ctx->HasInput("current_step"),
-                   "Input(current_step) of DGCop should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
-                      "Input(nranks) of DGCop should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("U_out"),
-                   "Output(U_out) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("V_out"),
-                   "Output(V_out) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("k"),
-                   "Output(k) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("EncodeGrad"),
-                   "Output(EncodeGrad) of DGCop should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("GatherBuff"), true,
-                      "Output(EncodeGrad) of DGCop should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("U"), "Input", "U", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("V"), "Input", "V", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step",
+                   "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("nranks"), "Input", "nranks", "DGCOp");
+
+    OP_INOUT_CHECK(ctx->HasOutput("U_out"), "Output", "U_out", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("V_out"), "Output", "V_out", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("k"), "Output", "k", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("EncodeGrad"), "Output", "EncodeGrad",
+                   "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("GatherBuff"), "Output", "GatherBuff",
+                   "DGCOp");
   }
 
  protected:

paddle/fluid/operators/dgc_op.h

Lines changed: 18 additions & 5 deletions

@@ -24,14 +24,22 @@ namespace operators {
 
 inline float get_period_sparcity(const std::vector<float>& sparsity,
                                  float cur_step, float rampup_steps) {
-  PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0);
+  PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0,
+                    platform::errors::InvalidArgument(
+                        "DGC current step=%d, but it must >= 0, "
+                        "please submit issue in github",
+                        static_cast<int>(cur_step)));
 
   size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps);
   if (idx >= sparsity.size()) {
     idx = sparsity.size() - 1;
   }
 
-  PADDLE_ENFORCE_LT(idx, sparsity.size());
+  PADDLE_ENFORCE_LT(
+      idx, sparsity.size(),
+      platform::errors::OutOfRange(
+          "sparsity index out of bounds. idx=%d >= sparsity.size=%d", idx,
+          sparsity.size()));
   return sparsity[idx];
 }
 
@@ -55,7 +63,10 @@ class DGCOpKernel : public framework::OpKernel<T> {
     // nranks
     auto nranks_tensor = ctx.Input<framework::Tensor>("nranks");
     const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
-    PADDLE_ENFORCE_GT(nranks, 1, "DGC is not useful when num_trainers <= 1");
+    PADDLE_ENFORCE_GT(nranks, 1,
+                      platform::errors::PreconditionNotMet(
+                          "DGC is not useful when num_trainers <= 1. Please "
+                          "use multi card or multi machine GPU"));
 
     // regularization
     auto p = ctx.Input<framework::Tensor>("Param");
@@ -105,8 +116,10 @@ class DGCOpKernel : public framework::OpKernel<T> {
         1 - get_period_sparcity(
                 sparsity, static_cast<float>(*current_step - rampup_begin_step),
                 rampup_step);
-    PADDLE_ENFORCE_GE(ratio, 0.0);
-    PADDLE_ENFORCE_LT(ratio, 1.0);
+    PADDLE_ENFORCE_GE(ratio, 0.0, platform::errors::InvalidArgument(
+                                      "DGC sparsity ratio must >= 0"));
+    PADDLE_ENFORCE_LT(ratio, 1.0, platform::errors::InvalidArgument(
+                                      "DGC sparsity ratio must < 1"));
     int k = static_cast<int>(g->numel() * ratio);
 
     VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov

paddle/fluid/operators/nccl/nccl_op.cc

Lines changed: 38 additions & 37 deletions

@@ -31,12 +31,15 @@ class NCCLInitOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kParallelScopes)),
-                            "Can not find variable '%s' in the scope.",
-                            kParallelScopes);
+    PADDLE_ENFORCE_NOT_NULL(
+        scope.FindVar(Input(kParallelScopes)),
+        platform::errors::NotFound("Can not find variable '%s' in the scope.",
+                                   kParallelScopes));
     const auto &name = Output("Communicator");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
-                            "Can not find variable '%s' in the scope.", name);
+    PADDLE_ENFORCE_NOT_NULL(
+        scope.FindVar(name),
+        platform::errors::NotFound(
+            "Output(%s) is needed for ncclInit operator.", name));
     // A parallel do may not use all the gpus. For example, the batch size is 7
     // in the last batch while we have 8 gpu. In this case, parallel_do will
     // create 7 parallel scopes, so should ncclInitOp create 7 gpu peers
@@ -46,11 +49,9 @@ class NCCLInitOp : public framework::OperatorBase {
     for (int i = 0; i < static_cast<int>(parallel_scopes.size()); ++i) {
       gpus[i] = i;
     }
-    PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus.");
-
-    if (scope.FindVar(name) == nullptr) {
-      PADDLE_THROW("Output(Communicator) is needed for ncclInit operator.");
-    }
+    PADDLE_ENFORCE_EQ(!gpus.empty(), true,
+                      platform::errors::PreconditionNotMet(
+                          "gpus is empty, NCCL must init with gpus"));
 
     platform::Communicator *comm =
         scope.FindVar(name)->GetMutable<platform::Communicator>();
@@ -92,17 +93,17 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of AllReduce op input should not be NULL");
-    PADDLE_ENFORCE(
-        ctx->HasInput("Communicator"),
-        " Input(Communicator) of AllReduce op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Output(Out) of AllReduce op output should not be NULL");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLAllReduce");
+    OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
+                   "NCCLAllReduce");
+
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLAllReduce");
+
     std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
-                    reduction == "ncclMin" || reduction == "ncclMax"),
-                   "invalid reduction.");
+    PADDLE_ENFORCE_EQ(
+        (reduction == "ncclSum" || reduction == "ncclProd" ||
+         reduction == "ncclMin" || reduction == "ncclMax"),
+        true, platform::errors::InvalidArgument("invalid nccl reduction."));
 
     auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
@@ -137,18 +138,17 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of Reduce op input should not be NULL");
-    PADDLE_ENFORCE(
-        ctx->HasInput("Communicator"),
-        " Input(Communicator) of Reduce op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Input(X) of Reduce op input should not be NULL");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLReduce");
+    OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
+                   "NCCLReduce");
+
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLReduce");
 
     std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
-                    reduction == "ncclMin" || reduction == "ncclMax"),
-                   "invalid reduction.");
+    PADDLE_ENFORCE_EQ(
+        (reduction == "ncclSum" || reduction == "ncclProd" ||
+         reduction == "ncclMin" || reduction == "ncclMax"),
+        true, platform::errors::InvalidArgument("invalid nccl reduction."));
 
     auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
@@ -188,15 +188,16 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
-                   " Input(Communicator) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Output(Out) of Bcast op output should not be NULL");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLBcast");
+    OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
+                   "NCCLBcast");
+
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLBcast");
 
     int root = ctx->Attrs().Get<int>("root");
-    PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
+    PADDLE_ENFORCE_EQ(
+        root != platform::kInvalidGPUId, true,
+        platform::errors::InvalidArgument("Bcast root must be set."));
 
     auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);

paddle/fluid/operators/nccl/nccl_op.cu.cc

Lines changed: 33 additions & 34 deletions

@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <functional>
+#include <unordered_map>
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -37,36 +38,42 @@ class NCCLTypeWrapper<double> {
   static const ncclDataType_t type = ncclDouble;
 };
 
+static ncclRedOp_t str_to_nccl_red_type(std::string reduction) {
+  static const std::unordered_map<std::string, ncclRedOp_t> str_to_type = {
+      {"ncclSum", ncclSum},
+      {"ncclMin", ncclMin},
+      {"ncclMax", ncclMax},
+      {"ncclProd", ncclProd},
+  };
+  auto it = str_to_type.find(reduction);
+  PADDLE_ENFORCE_EQ(it != str_to_type.end(), true,
+                    platform::errors::InvalidArgument(
+                        "Invalid nccl reduction. Must be ncclMin | ncclMax | "
+                        "ncclProd | ncclSum"));
+  return it->second;
+}
+
 template <typename T>
 class NCCLAllReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::PreconditionNotMet(
+                          "This kernel only runs on GPU device."));
     auto* x = ctx.Input<LoDTensor>("X");
     auto* out = ctx.Output<LoDTensor>("Out");
    auto* comm = ctx.Input<Communicator>("Communicator");
     std::string reduction = ctx.Attr<std::string>("reduction");
 
-    ncclRedOp_t reduction_op_ = ncclSum;
-    if (reduction == "ncclMin") {
-      reduction_op_ = ncclMin;
-    } else if (reduction == "ncclMax") {
-      reduction_op_ = ncclMax;
-    } else if (reduction == "ncclSum") {
-      reduction_op_ = ncclSum;
-    } else if (reduction == "ncclProd") {
-      reduction_op_ = ncclProd;
-    } else {
-      PADDLE_THROW("Invalid reduction. default ncclSum.");
-    }
+    auto reduction_op_ = str_to_nccl_red_type(reduction);
+
     // device id
     int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
     VLOG(3) << "gpu : "
            << " invoke allreduce. send " << x->numel() << " recv "
            << out->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
        x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
        NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx),
        ctx.cuda_device_context().stream()));
@@ -80,26 +87,17 @@ template <typename T>
 class NCCLReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "This kernel only runs on GPU device."));
     auto x = ctx.Input<LoDTensor>("X");  // x0, x1, x2
     auto out = ctx.Output<LoDTensor>("Out");
     auto* comm = ctx.Input<Communicator>("Communicator");
     int root = ctx.Attr<int>("root");
     std::string reduction = ctx.Attr<std::string>("reduction");
 
-    ncclRedOp_t reduction_op_ = ncclSum;
-    if (reduction == "ncclMin") {
-      reduction_op_ = ncclMin;
-    } else if (reduction == "ncclMax") {
-      reduction_op_ = ncclMax;
-    } else if (reduction == "ncclSum") {
-      reduction_op_ = ncclSum;
-    } else if (reduction == "ncclProd") {
-      reduction_op_ = ncclProd;
-    } else {
-      PADDLE_THROW("Invalid reduction. default ncclSum.");
-    }
+    auto reduction_op_ = str_to_nccl_red_type(reduction);
+
     // device id
     int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
@@ -111,7 +109,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     }
     VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
            << " recv " << out->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
        x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type,
        reduction_op_, root, comm->comms().at(idx),
        ctx.cuda_device_context().stream()));
@@ -124,8 +122,9 @@ template <typename T>
 class NCCLBcastKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "This kernel only runs on GPU device."));
     int root = ctx.Attr<int>("root");
     auto* comm = ctx.Input<Communicator>("Communicator");
     // device id
@@ -134,7 +133,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
     if (idx == root) {
       auto* x = ctx.Input<LoDTensor>("X");
       VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
-      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
          reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(),
          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
          ctx.cuda_device_context().stream()));
@@ -143,7 +142,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
       auto* out = ctx.Output<LoDTensor>("Out");
       VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
              << framework::product(out->dims());
-      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
          out->mutable_data<T>(ctx.GetPlace()), out->numel(),
          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
          ctx.cuda_device_context().stream()));
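Besides the kernels switching to str_to_nccl_red_type, every raw PADDLE_ENFORCE(ncclAllReduce(...)), ncclReduce and ncclBcast call is wrapped in PADDLE_ENFORCE_CUDA_SUCCESS, which checks the returned status code rather than treating it as a plain boolean. A generic standalone sketch of that idea, with a made-up Status type and demo call (not the real macro, and not the real NCCL/CUDA enums):

// Illustration of checking a status-returning call instead of a boolean.
// Status, kSuccess and demo_all_reduce are invented for this sketch.
#include <stdexcept>
#include <string>

enum class Status { kSuccess = 0, kInvalidUsage = 1, kInternalError = 2 };

#define DEMO_ENFORCE_SUCCESS(call)                                         \
  do {                                                                     \
    Status _st = (call);                                                   \
    if (_st != Status::kSuccess) {                                         \
      throw std::runtime_error(std::string(#call) + " failed with code " + \
                               std::to_string(static_cast<int>(_st)));     \
    }                                                                      \
  } while (0)

Status demo_all_reduce(bool ok) {
  return ok ? Status::kSuccess : Status::kInternalError;
}

int main() {
  DEMO_ENFORCE_SUCCESS(demo_all_reduce(true));      // passes silently
  // DEMO_ENFORCE_SUCCESS(demo_all_reduce(false));  // would throw
  return 0;
}

Stringifying the call in the message (as this sketch does with #call) makes it clear which collective failed and with what code.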
