Skip to content

Commit 188bcbb

Browse files
authored
Log level adjustment for distributed training (#31236)
Change-Id: I6210ce9c60bed48f3323c47b16500302b66cedf2
1 parent b8a593e commit 188bcbb

File tree

10 files changed: 26 additions and 32 deletions

paddle/fluid/distributed/fleet.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -508,7 +508,7 @@ void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope,
508508
if (name.find("batch_sum") != std::string::npos) {
509509
Variable* var = scope->FindVar(name);
510510
CHECK(var != nullptr) << "var[" << name << "] not found";
511-
VLOG(0) << "prepare shrink dense batch_sum";
511+
VLOG(3) << "prepare shrink dense batch_sum";
512512
LoDTensor* tensor = var->GetMutable<LoDTensor>();
513513
float* g = tensor->data<float>();
514514

paddle/fluid/distributed/service/brpc_ps_server.cc

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,16 +76,13 @@ uint64_t BrpcPsServer::start(const std::string &ip, uint32_t port) {
7676
}
7777
}
7878

79-
VLOG(0) << "BrpcPsServer::start registe_ps_server";
8079
_environment->registe_ps_server(ip, port, _rank);
81-
VLOG(0) << "BrpcPsServer::start wait";
8280
cv_.wait(lock, [&] { return stoped_; });
8381

8482
PSHost host;
8583
host.ip = ip;
8684
host.port = port;
8785
host.rank = _rank;
88-
VLOG(0) << "BrpcPsServer::start return host.rank";
8986
return host.rank;
9087
}
9188

@@ -461,7 +458,7 @@ int32_t BrpcPsService::save_one_table(Table *table,
461458

462459
int32_t feasign_size = 0;
463460

464-
VLOG(0) << "save one table " << request.params(0) << " " << request.params(1);
461+
VLOG(3) << "save table " << request.params(0) << " " << request.params(1);
465462
feasign_size = table->save(request.params(0), request.params(1));
466463
if (feasign_size < 0) {
467464
set_response_code(response, -1, "table save failed");
@@ -504,7 +501,7 @@ int32_t BrpcPsService::shrink_table(Table *table,
504501
set_response_code(response, -1, "table shrink failed");
505502
return -1;
506503
}
507-
VLOG(0) << "Pserver Shrink Finished";
504+
VLOG(3) << "Pserver Shrink Finished";
508505
return 0;
509506
}
510507

paddle/fluid/distributed/service/communicator.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ inline double GetCurrentUS() {
5252
Communicator::Communicator() {}
5353

5454
void Communicator::init_gflag(const std::string &gflags) {
55-
VLOG(0) << "Init With Gflags:" << gflags;
55+
VLOG(3) << "Init With Gflags:" << gflags;
5656
std::vector<std::string> flags = paddle::string::split_string(gflags);
5757
if (flags.size() < 1) {
5858
flags.push_back("-max_body_size=314217728");

paddle/fluid/distributed/service/communicator.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -194,10 +194,10 @@ class Communicator {
194194
Communicator();
195195

196196
explicit Communicator(const std::map<std::string, std::string> &envs_) {
197-
VLOG(0) << "Communicator Init Envs";
197+
VLOG(3) << "Communicator Init Envs";
198198
for (auto &iter : envs_) {
199199
envs[iter.first] = iter.second;
200-
VLOG(0) << iter.first << ": " << iter.second;
200+
VLOG(3) << iter.first << ": " << iter.second;
201201
}
202202
barrier_table_id_ = std::stoi(envs.at("barrier_table_id"));
203203
trainer_id_ = std::stoi(envs.at("trainer_id"));
@@ -431,7 +431,7 @@ class HalfAsyncCommunicator : public AsyncCommunicator {
431431
need_global_step_ =
432432
static_cast<bool>(std::stoi(envs.at("need_global_step")));
433433

434-
VLOG(0) << "HalfAsyncCommunicator Initialized";
434+
VLOG(1) << "HalfAsyncCommunicator Initialized";
435435
}
436436

437437
void MainThread() override;
@@ -476,7 +476,7 @@ class SyncCommunicator : public HalfAsyncCommunicator {
476476
need_global_step_ =
477477
static_cast<bool>(std::stoi(envs.at("need_global_step")));
478478

479-
VLOG(0) << "SyncCommunicator Initialized";
479+
VLOG(1) << "SyncCommunicator Initialized";
480480
}
481481

482482
void BarrierSend();
@@ -520,7 +520,7 @@ class GeoCommunicator : public AsyncCommunicator {
520520
// id_queue's size
521521
max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num"));
522522
send_queue_size_ = max_merge_var_num_;
523-
VLOG(0) << "GeoCommunicator Initialized";
523+
VLOG(1) << "GeoCommunicator Initialized";
524524
}
525525

526526
void Send(const std::vector<std::string> &var_names,

paddle/fluid/distributed/service/heter_client.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,28 +42,28 @@ void HeterClient::MainThread() {
4242
void HeterClient::Stop() {
4343
running_ = false;
4444
if (!is_initialized_) {
45-
VLOG(0) << "HeterClient is not inited, do nothing";
45+
VLOG(3) << "HeterClient is not inited, do nothing";
4646
} else {
4747
if (main_thread_) {
4848
auto status = StopHeterWorker();
4949
status.wait();
5050
main_thread_->join();
5151
main_thread_.reset(nullptr);
5252
}
53-
VLOG(1) << "HeterClient Stop Done";
53+
VLOG(3) << "HeterClient Stop Done";
5454
}
5555
}
5656

5757
void HeterClient::FinalizeWorker() {
5858
running_ = false;
5959
if (!is_initialized_) {
60-
VLOG(0) << "HeterClient is not inited, do nothing";
60+
VLOG(3) << "HeterClient is not inited, do nothing";
6161
} else {
6262
if (main_thread_) {
6363
main_thread_->join();
6464
main_thread_.reset(nullptr);
6565
}
66-
VLOG(1) << "HeterClient Stop Done";
66+
VLOG(3) << "HeterClient Stop Done";
6767
}
6868
}
6969

paddle/fluid/distributed/service/heter_server.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ int32_t HeterService::stop_heter_worker(const PsRequestMessage& request,
9595
stop_cpu_worker_set_.insert(client_id);
9696
if (stop_cpu_worker_set_.size() == fan_in_) {
9797
is_exit_ = true;
98-
VLOG(0) << "Stop heter Service done.";
98+
VLOG(3) << "Stop heter Service done.";
9999
}
100100
return 0;
101101
}

paddle/fluid/distributed/service/heter_server.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ class HeterServer {
136136
virtual ~HeterServer() {}
137137

138138
void Stop() {
139-
VLOG(0) << "HeterServer Stop()";
139+
VLOG(3) << "HeterServer Stop()";
140140
std::unique_lock<std::mutex> lock(mutex_);
141141
stoped_ = true;
142142
cv_.notify_all();

paddle/fluid/distributed/table/common_dense_table.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ int32_t CommonDenseTable::initialize_optimizer() {
9191
} else {
9292
VLOG(0) << "init optimizer failed";
9393
}
94-
VLOG(0) << "init optimizer " << name << " done";
94+
VLOG(3) << "init optimizer " << name << " done";
9595
return 0;
9696
}
9797

paddle/fluid/distributed/table/common_dense_table.h

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,15 +45,12 @@ class CommonDenseTable : public DenseTable {
4545
virtual int32_t set_global_lr(float* lr) override;
4646

4747
int32_t load(const std::string& path, const std::string& param) override {
48-
VLOG(0) << "Dense table may load by "
49-
"paddle.distributed.fleet.init_server";
48+
VLOG(0) << "WARNING: dense variables will load on No.0 trainer";
5049
return 0;
5150
}
5251

5352
int32_t save(const std::string& path, const std::string& param) override {
54-
VLOG(0)
55-
<< "Dense table may be saved by "
56-
"paddle.distributed.fleet.save_persistables/save_inference_model";
53+
VLOG(0) << "WARNING: dense variables will save on No.0 trainer";
5754
return 0;
5855
}
5956

paddle/fluid/distributed/table/common_sparse_table.cc

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath,
166166
auto id = std::stoull(values[0]);
167167

168168
if (id % pserver_num != pserver_id) {
169-
VLOG(0) << "will not load " << values[0] << " from " << valuepath
169+
VLOG(3) << "will not load " << values[0] << " from " << valuepath
170170
<< ", please check id distribution";
171171
continue;
172172
}
@@ -259,7 +259,7 @@ int32_t CommonSparseTable::initialize_value() {
259259
}
260260
}
261261

262-
VLOG(0) << "has " << feasigns.size() << " ids need to be pre inited";
262+
VLOG(3) << "has " << feasigns.size() << " ids need to be pre inited";
263263

264264
auto buckets = bucket(feasigns.size(), 10);
265265
for (int x = 0; x < 10; ++x) {
@@ -291,10 +291,10 @@ int32_t CommonSparseTable::initialize_optimizer() {
291291
optimizer_ = std::make_shared<SSUM>(value_names_, value_dims_,
292292
value_offsets_, value_idx_);
293293
} else {
294-
VLOG(0) << "init optimizer failed";
294+
VLOG(3) << "init optimizer failed";
295295
}
296296

297-
VLOG(0) << "init optimizer " << name << " done";
297+
VLOG(3) << "init optimizer " << name << " done";
298298
return 0;
299299
}
300300

@@ -307,7 +307,7 @@ int32_t CommonSparseTable::set_global_lr(float* lr) {
307307
int32_t CommonSparseTable::load(const std::string& path,
308308
const std::string& param) {
309309
rwlock_->WRLock();
310-
VLOG(0) << "sparse table load with " << path << " with meta " << param;
310+
VLOG(3) << "sparse table load with " << path << " with meta " << param;
311311
LoadFromText(path, param, _shard_idx, _shard_num, task_pool_size_,
312312
&shard_values_);
313313
rwlock_->UNLock();
@@ -318,7 +318,7 @@ int32_t CommonSparseTable::save(const std::string& dirname,
318318
const std::string& param) {
319319
rwlock_->WRLock();
320320
int mode = std::stoi(param);
321-
VLOG(0) << "sparse table save: " << dirname << " mode: " << mode;
321+
VLOG(3) << "sparse table save: " << dirname << " mode: " << mode;
322322

323323
auto varname = _config.common().table_name();
324324
std::string var_store =
@@ -534,11 +534,11 @@ int32_t CommonSparseTable::flush() { return 0; }
534534
int32_t CommonSparseTable::shrink(const std::string& param) {
535535
rwlock_->WRLock();
536536
int threshold = std::stoi(param);
537-
VLOG(0) << "sparse table shrink: " << threshold;
537+
VLOG(3) << "sparse table shrink: " << threshold;
538538

539539
for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) {
540540
// shrink
541-
VLOG(0) << shard_id << " " << task_pool_size_ << " begin shrink";
541+
VLOG(4) << shard_id << " " << task_pool_size_ << " begin shrink";
542542
shard_values_[shard_id]->Shrink(threshold);
543543
}
544544
rwlock_->UNLock();

0 commit comments

Comments (0)