Skip to content

Commit 5c3656b

Browse files
authored
Add NaN/Inf check to downpour worker (#20694) (#20925)
* Add NaN/Inf check to downpour worker during training
* test=develop
1 parent 781d284 commit 5c3656b

File tree

6 files changed

+46
-0
lines changed

6 files changed

+46
-0
lines changed

paddle/fluid/framework/device_worker.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,8 @@ class DownpourWorker : public HogwildWorker {
230230
// adjust ins weight
231231
AdjustInsWeightConfig adjust_ins_weight_config_;
232232
std::vector<float> nid_show_;
233+
// names of the variables to check for NaN and Inf during training
234+
std::vector<std::string> check_nan_var_names_;
233235
};
234236

235237
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)

paddle/fluid/framework/downpour_worker.cc

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
8181
dump_fields_[i] = desc.dump_fields(i);
8282
}
8383
adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
84+
for (int i = 0; i < desc.check_nan_var_names_size(); ++i) {
85+
check_nan_var_names_.push_back(desc.check_nan_var_names(i));
86+
}
8487
}
8588

8689
void DownpourWorker::SetChannelWriter(ChannelObject<std::string>* queue) {
@@ -468,6 +471,22 @@ void DownpourWorker::TrainFilesWithProfiler() {
468471
}
469472
}
470473

474+
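// check every variable listed in check_nan_var_names_ for Inf and NaN;
// variables not found in the thread scope are skipped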
475+
for (std::string& var_name : check_nan_var_names_) {
476+
Variable* var = thread_scope_->FindVar(var_name);
477+
if (var == nullptr) {
478+
continue;
479+
}
480+
LoDTensor* tensor = var->GetMutable<LoDTensor>();
481+
if (tensor == nullptr) {
482+
continue;
483+
}
484+
PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false,
485+
"Tensor %s contains Inf", var_name);
486+
PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false,
487+
"Tensor %s contains NAN", var_name);
488+
}
489+
471490
if (need_to_push_sparse_) {
472491
for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
473492
++i) {
@@ -655,6 +674,22 @@ void DownpourWorker::TrainFiles() {
655674
}
656675
}
657676

677+
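// check every variable listed in check_nan_var_names_ for Inf and NaN;
// variables not found in the thread scope are skipped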
678+
for (std::string& var_name : check_nan_var_names_) {
679+
Variable* var = thread_scope_->FindVar(var_name);
680+
if (var == nullptr) {
681+
continue;
682+
}
683+
LoDTensor* tensor = var->GetMutable<LoDTensor>();
684+
if (tensor == nullptr) {
685+
continue;
686+
}
687+
PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false,
688+
"Tensor %s contains Inf", var_name);
689+
PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false,
690+
"Tensor %s contains NAN", var_name);
691+
}
692+
658693
if (need_to_push_sparse_) {
659694
// push gradients here
660695
for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();

paddle/fluid/framework/trainer_desc.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ message TrainerDesc {
4242

4343
optional int32 mpi_size = 16 [ default = -1 ];
4444
optional int32 dump_file_num = 17 [ default = 16 ];
45+
repeated string check_nan_var_names = 18;
4546

4647
// device worker parameters
4748
optional HogwildWorkerParameter hogwild_param = 101;

python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,8 @@ def _minimize(self,
248248
opt_info["use_cvm"] = strategy.get("use_cvm", False)
249249
opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
250250
opt_info["scale_datanorm"] = strategy.get("scale_datanorm", -1)
251+
opt_info["check_nan_var_names"] = strategy.get("check_nan_var_names",
252+
[])
251253
opt_info["dump_slot"] = False
252254
opt_info["dump_converter"] = ""
253255
opt_info["dump_fields"] = strategy.get("dump_fields", [])

python/paddle/fluid/trainer_desc.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,10 @@ def _set_dump_file_num(self, dump_file_num):
100100
def _set_dump_converter(self, converter):
101101
self.proto_desc.dump_converter = converter
102102

103+
def _set_check_nan_var_names(self, check_nan_var_names):
104+
for var in check_nan_var_names:
105+
self.proto_desc.check_nan_var_names.append(var)
106+
103107
def _set_adjust_ins_weight(self, config_dict):
104108
self.proto_desc.adjust_ins_weight_config.need_adjust = \
105109
config_dict.get("need_adjust", False)

python/paddle/fluid/trainer_factory.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ def _create_trainer(self, opt_info=None):
5353
trainer._set_dump_file_num(opt_info["dump_file_num"])
5454
trainer._set_dump_converter(opt_info["dump_converter"])
5555
trainer._set_adjust_ins_weight(opt_info["adjust_ins_weight"])
56+
trainer._set_check_nan_var_names(opt_info[
57+
"check_nan_var_names"])
5658
trainer._set_device_worker(device_worker)
5759
return trainer
5860

0 commit comments

Comments
 (0)