Commit 3cb6395

better profiler and benchmark
1 parent 38af7bc commit 3cb6395

4 files changed: 66 additions & 42 deletions


benchmark/fluid/fluid_benchmark.py

Lines changed: 20 additions & 12 deletions
@@ -98,6 +98,8 @@ def parse_args():
         '--use_fake_data',
         action='store_true',
         help='If set ommit the actual read data operators.')
+    parser.add_argument(
+        '--profile', action='store_true', help='If set, profile a few steps.')
     parser.add_argument(
         '--update_method',
         type=str,
@@ -108,8 +110,8 @@ def parse_args():
     return args


-def append_nccl2_prepare():
-    if os.getenv("PADDLE_TRAINER_ID", None) != None:
+def append_nccl2_prepare(trainer_id):
+    if trainer_id >= 0:
         # append gen_nccl_id at the end of startup program
         trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
         port = os.getenv("PADDLE_PSERVER_PORT")
@@ -136,12 +138,12 @@ def append_nccl2_prepare():
             })
         return nccl_id_var, num_trainers, trainer_id
     else:
-        raise Exception(
-            "must set PADDLE_TRAINER_ID env variables for dist train.")
+        raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
+                        "nccl-based dist train.")


-def dist_transpile():
-    if "PADDLE_TRAINING_ROLE" not in os.environ:
+def dist_transpile(trainer_id):
+    if trainer_id < 0:
         return None, None

     # the port of all pservers, needed by both trainer and pserver
@@ -158,9 +160,6 @@ def dist_transpile():
     trainers = int(os.getenv("PADDLE_TRAINERS"))
     # the IP of the local machine, needed by pserver only
     current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
-    # the unique trainer id, starting from 0, needed by trainer
-    # only
-    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
     # the role, should be either PSERVER or TRAINER
     training_role = os.getenv("PADDLE_TRAINING_ROLE")

@@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
         iters = 0
         start_time = time.time()
         for batch_id, data in enumerate(train_reader()):
+            if args.profile and pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif args.profile and pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
+
             if iters == args.skip_batch_num:
                 start_time = time.time()
                 num_samples = 0
@@ -334,7 +338,11 @@ def print_arguments(args):
 def main():
     args = parse_args()
     print_arguments(args)
-    nccl_id_var, num_trainers, trainer_id = None, 1, 0
+
+    # the unique trainer id, starting from 0, needed by trainer
+    # only
+    nccl_id_var, num_trainers, trainer_id = (
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))

     if args.use_cprof:
         pr = cProfile.Profile()
@@ -348,7 +356,7 @@ def main():
         fluid.memory_optimize(fluid.default_main_program())

     if args.update_method == "pserver":
-        train_prog, startup_prog = dist_transpile()
+        train_prog, startup_prog = dist_transpile(trainer_id)
         if not train_prog:
             raise Exception(
                 "Must configure correct environments to run dist train.")
@@ -364,7 +372,7 @@ def main():
         train_args.append(fluid.default_startup_program())

     if args.update_method == "nccl2":
-        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare()
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
     if args.gpus == 1:
         # NOTE: parallel executor use profiler interanlly
         if args.use_nvprof and args.device == 'GPU':
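
The new --profile flag above instruments only a short window of early batches (5 through 9 of the first pass) and writes one report per trainer, so the cost of tracing a whole pass is avoided. A minimal sketch of the same pattern outside this benchmark, assuming paddle.fluid.profiler is importable and using a hypothetical run_one_step() in place of the benchmark's executor call:

    import os
    import paddle.fluid.profiler as profiler

    def train_loop(args, train_reader, run_one_step):
        # Same default as main() above: -1 means single-machine training.
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "-1"))
        for batch_id, data in enumerate(train_reader()):
            if args.profile and batch_id == 5:
                # Collect both CPU and GPU events from batch 5 onwards.
                profiler.start_profiler("All")
            elif args.profile and batch_id == 10:
                # Dump a per-trainer report sorted by total time.
                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
            run_one_step(data)  # hypothetical: one executor.run() step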

paddle/fluid/operators/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -272,6 +272,8 @@ if(NOT WITH_MKLDNN)
     list(REMOVE_ITEM GENERAL_OPS fc_op)
 endif(NOT WITH_MKLDNN)

+list(REMOVE_ITEM GENERAL_OPS reduce_op)
+
 foreach(src ${GENERAL_OPS})
     op_library(${src})
 endforeach()

paddle/fluid/platform/profiler.cc

Lines changed: 8 additions & 4 deletions
@@ -38,6 +38,7 @@ struct EventList;

 static int64_t profiler_lister_id = 0;
 static bool should_send_profile_state = false;
+std::mutex profiler_mu;

 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
@@ -228,11 +229,13 @@ void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
                  "ProfilerState::kDisabled");
+
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (state == g_state) {
     return;
   }
   g_state = state;
-  should_send_profile_state = true;
+  { should_send_profile_state = true; }
   GetDeviceTracer()->Enable();
 #ifdef PADDLE_WITH_CUDA
   if (g_state == ProfilerState::kCUDA) {
@@ -295,7 +298,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
   } else if (g_state == ProfilerState::kAll) {
     place = "All";
   } else {
-    PADDLE_THROW("Invalid profiler state");
+    PADDLE_THROW("Invalid profiler state", g_state);
   }

   std::cout << "Place: " << place << std::endl;
@@ -443,6 +446,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,

 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path) {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
   Mark("_stop_profiler_", nullptr);
@@ -456,7 +460,7 @@ void DisableProfiler(EventSortingKey sorted_key,
     tracer->GenProfile(profile_path);
   }
   g_state = ProfilerState::kDisabled;
-  should_send_profile_state = true;
+  { should_send_profile_state = true; }
 }

 bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
@@ -466,7 +470,7 @@ void SetProfileListener() {
   std::mt19937 rng;
   rng.seed(std::random_device()());
   std::uniform_int_distribution<std::mt19937::result_type> dist6(
-      1, std::numeric_limits<std::mt19937::result_type>::max());
+      1, std::numeric_limits<int>::max());
   profiler_lister_id = dist6(rng);
 }
 int64_t ListenerId() { return profiler_lister_id; }
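
The profiler_mu introduced above serializes EnableProfiler and DisableProfiler, so the Python-level start/stop calls added in this commit no longer race each other on g_state. One behavior visible from the context lines: enabling again with the same state simply hits the early return. A minimal sketch of that through the Python wrapper, assuming the bindings are built and importable as paddle.fluid.profiler:

    import paddle.fluid.profiler as profiler

    profiler.start_profiler("CPU")
    profiler.start_profiler("CPU")  # no-op: state == g_state, EnableProfiler returns early
    # ... run a few iterations of any fluid program ...
    profiler.stop_profiler(sorted_key="total", profile_path="/tmp/profile_cpu")  # hypothetical output path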

python/paddle/fluid/profiler.py

Lines changed: 36 additions & 26 deletions
@@ -16,7 +16,10 @@
 from contextlib import contextmanager
 import os

-__all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
+__all__ = [
+    'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
+    'stop_profiler'
+]

 NVPROF_CONFIG = [
     "gpustarttimestamp",
@@ -72,6 +75,36 @@ def reset_profiler():
     core.reset_profiler()


+def start_profiler(state):
+    if state not in ['CPU', 'GPU', "All"]:
+        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
+    if state == "GPU":
+        prof_state = core.ProfilerState.kCUDA
+    elif state == "CPU":
+        prof_state = core.ProfilerState.kCPU
+    else:
+        prof_state = core.ProfilerState.kAll
+    core.enable_profiler(prof_state)
+
+
+def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
+    sorted_key = 'default' if sorted_key is None else sorted_key
+    if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
+        raise ValueError("The sorted_key must be None or in 'calls', 'total', "
+                         "'max', 'min' and 'ave'")
+    key_map = {
+        'default': core.EventSortingKey.kDefault,
+        'calls': core.EventSortingKey.kCalls,
+        'total': core.EventSortingKey.kTotal,
+        'max': core.EventSortingKey.kMax,
+        'min': core.EventSortingKey.kMin,
+        'ave': core.EventSortingKey.kAve,
+    }
+    # TODO(qingqing) : redirect C++ ostream to Python stream.
+    # with core.ostream_redirect(stdout=True, stderr=True):
+    core.disable_profiler(key_map[sorted_key], profile_path)
+
+
 @contextmanager
 def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     """The profiler interface.
@@ -98,29 +131,6 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
         profile_path (string) : If state == 'All', it will write a profile
                             proto output file.
     """
-    if state not in ['CPU', 'GPU', "All"]:
-        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
-    if state == "GPU":
-        prof_state = core.ProfilerState.kCUDA
-    elif state == "CPU":
-        prof_state = core.ProfilerState.kCPU
-    else:
-        prof_state = core.ProfilerState.kAll
-    core.enable_profiler(prof_state)
+    start_profiler(state)
     yield
-
-    sorted_key = 'default' if sorted_key is None else sorted_key
-    if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
-        raise ValueError("The sorted_key must be None or in 'calls', 'total', "
-                         "'max', 'min' and 'ave'")
-    key_map = {
-        'default': core.EventSortingKey.kDefault,
-        'calls': core.EventSortingKey.kCalls,
-        'total': core.EventSortingKey.kTotal,
-        'max': core.EventSortingKey.kMax,
-        'min': core.EventSortingKey.kMin,
-        'ave': core.EventSortingKey.kAve,
-    }
-    # TODO(qingqing) : redirect C++ ostream to Python stream.
-    # with core.ostream_redirect(stdout=True, stderr=True):
-    core.disable_profiler(key_map[sorted_key], profile_path)
+    stop_profiler(sorted_key, profile_path)
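
With the context manager reduced to a thin wrapper over start_profiler/stop_profiler, callers can pick either style; the benchmark above needs the imperative one because it starts and stops at specific batch indices rather than around a lexical block. A minimal usage sketch, assuming only the public API shown in this diff:

    import paddle.fluid.profiler as profiler

    # Context-manager style: profile everything inside the with-block.
    with profiler.profiler('CPU', sorted_key='total', profile_path='/tmp/profile'):
        pass  # run some fluid program here

    # Imperative style: wrap arbitrary, non-lexical regions.
    profiler.start_profiler('All')
    # ... run a few training iterations ...
    profiler.stop_profiler(sorted_key='calls', profile_path='/tmp/profile_all')  # hypothetical output path

stop_profiler accepts the same sorted_key values as the context manager ('calls', 'total', 'max', 'min', 'ave', or None for the default event ordering).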
