
Commit 35d5b1b

Merge pull request #11036 from panyx0718/dist_timeline

better profiler and benchmark

2 parents: 32d5086 + f14e579

File tree: 4 files changed (+87 −38 lines)

benchmark/fluid/fluid_benchmark.py

Lines changed: 20 additions & 12 deletions
@@ -98,6 +98,8 @@ def parse_args():
         '--use_fake_data',
         action='store_true',
         help='If set ommit the actual read data operators.')
+    parser.add_argument(
+        '--profile', action='store_true', help='If set, profile a few steps.')
     parser.add_argument(
         '--update_method',
         type=str,
@@ -108,8 +110,8 @@ def parse_args():
     return args


-def append_nccl2_prepare():
-    if os.getenv("PADDLE_TRAINER_ID", None) != None:
+def append_nccl2_prepare(trainer_id):
+    if trainer_id >= 0:
         # append gen_nccl_id at the end of startup program
         trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
         port = os.getenv("PADDLE_PSERVER_PORT")
@@ -136,12 +138,12 @@ def append_nccl2_prepare():
         })
         return nccl_id_var, num_trainers, trainer_id
     else:
-        raise Exception(
-            "must set PADDLE_TRAINER_ID env variables for dist train.")
+        raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
+                        "nccl-based dist train.")


-def dist_transpile():
-    if "PADDLE_TRAINING_ROLE" not in os.environ:
+def dist_transpile(trainer_id):
+    if trainer_id < 0:
         return None, None

     # the port of all pservers, needed by both trainer and pserver
@@ -158,9 +160,6 @@ def dist_transpile():
     trainers = int(os.getenv("PADDLE_TRAINERS"))
     # the IP of the local machine, needed by pserver only
     current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
-    # the unique trainer id, starting from 0, needed by trainer
-    # only
-    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
     # the role, should be either PSERVER or TRAINER
     training_role = os.getenv("PADDLE_TRAINING_ROLE")

@@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
     iters = 0
     start_time = time.time()
     for batch_id, data in enumerate(train_reader()):
+        if args.profile and pass_id == 0 and batch_id == 5:
+            profiler.start_profiler("All")
+        elif args.profile and pass_id == 0 and batch_id == 10:
+            profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
+
         if iters == args.skip_batch_num:
             start_time = time.time()
             num_samples = 0
@@ -334,7 +338,11 @@ def print_arguments(args):
 def main():
     args = parse_args()
     print_arguments(args)
-    nccl_id_var, num_trainers, trainer_id = None, 1, 0
+
+    # the unique trainer id, starting from 0, needed by trainer
+    # only
+    nccl_id_var, num_trainers, trainer_id = (
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))

     if args.use_cprof:
         pr = cProfile.Profile()
@@ -348,7 +356,7 @@ def main():
         fluid.memory_optimize(fluid.default_main_program())

     if args.update_method == "pserver":
-        train_prog, startup_prog = dist_transpile()
+        train_prog, startup_prog = dist_transpile(trainer_id)
         if not train_prog:
             raise Exception(
                 "Must configure correct environments to run dist train.")
@@ -364,7 +372,7 @@ def main():
         train_args.append(fluid.default_startup_program())

     if args.update_method == "nccl2":
-        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare()
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
     if args.gpus == 1:
         # NOTE: parallel executor use profiler interanlly
         if args.use_nvprof and args.device == 'GPU':
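For readers who want to reproduce this profiling window outside fluid_benchmark.py, the pattern reduces to a few lines. The following is a minimal sketch, assuming a hypothetical train_reader generator and run_one_batch step function; only the --profile flag, the batch 5-to-10 window, the "All" state, and the per-trainer output path come from the diff above.

import paddle.fluid.profiler as profiler

def run_pass(pass_id, train_reader, run_one_batch, args, trainer_id=0):
    # Profile batches 5..9 of the first pass only, then dump a summary
    # sorted by total time to /tmp/profile_<trainer_id>.
    for batch_id, data in enumerate(train_reader()):
        if args.profile and pass_id == 0 and batch_id == 5:
            profiler.start_profiler("All")
        elif args.profile and pass_id == 0 and batch_id == 10:
            profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
        run_one_batch(data)  # hypothetical per-batch training step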

paddle/fluid/platform/profiler.cc

Lines changed: 6 additions & 2 deletions
@@ -38,6 +38,7 @@ struct EventList;

 static int64_t profiler_lister_id = 0;
 static bool should_send_profile_state = false;
+std::mutex profiler_mu;

 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
@@ -228,6 +229,8 @@ void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
                  "ProfilerState::kDisabled");
+
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (state == g_state) {
     return;
   }
@@ -295,7 +298,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
   } else if (g_state == ProfilerState::kAll) {
     place = "All";
   } else {
-    PADDLE_THROW("Invalid profiler state");
+    PADDLE_THROW("Invalid profiler state", g_state);
   }

   std::cout << "Place: " << place << std::endl;
@@ -443,6 +446,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,

 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path) {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
   Mark("_stop_profiler_", nullptr);
@@ -466,7 +470,7 @@ void SetProfileListener() {
   std::mt19937 rng;
   rng.seed(std::random_device()());
   std::uniform_int_distribution<std::mt19937::result_type> dist6(
-      1, std::numeric_limits<std::mt19937::result_type>::max());
+      1, std::numeric_limits<int>::max());
   profiler_lister_id = dist6(rng);
 }
 int64_t ListenerId() { return profiler_lister_id; }
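The new profiler_mu lock, combined with the existing early return when the requested state equals g_state, should make the enable/disable transition atomic, so redundant or racing calls no longer corrupt the profiler state. A minimal Python-side illustration, assuming the start_profiler/stop_profiler wrappers introduced in python/paddle/fluid/profiler.py below; the output path is an arbitrary choice, not part of the commit:

import paddle.fluid.profiler as profiler

profiler.start_profiler("CPU")
profiler.start_profiler("CPU")       # already enabled: returns early
profiler.stop_profiler("total", "/tmp/profile_demo")
profiler.stop_profiler("total", "/tmp/profile_demo")   # already disabled: no-op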

paddle/fluid/pybind/pybind.cc

Lines changed: 1 addition & 0 deletions
@@ -495,6 +495,7 @@ All parameter, weight, gradient are variables in Paddle.

   m.def("enable_profiler", platform::EnableProfiler);
   m.def("disable_profiler", platform::DisableProfiler);
+  m.def("is_profiler_enabled", platform::IsProfileEnabled);
   m.def("reset_profiler", platform::ResetProfiler);

   // -- python binds for parallel executor.
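This binding exposes the C++ profiler state to Python as core.is_profiler_enabled(), which is what allows start_profiler and stop_profiler to guard against double start/stop. A short usage sketch, assuming the profiler begins in the disabled state:

import paddle.fluid.core as core
import paddle.fluid.profiler as profiler

print(core.is_profiler_enabled())    # False: profiler starts out disabled
profiler.start_profiler("CPU")
print(core.is_profiler_enabled())    # True while profiling
profiler.stop_profiler()             # writes /tmp/profile by default
print(core.is_profiler_enabled())    # False again once disabled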

python/paddle/fluid/profiler.py

Lines changed: 60 additions & 24 deletions
@@ -16,7 +16,10 @@
 from contextlib import contextmanager
 import os

-__all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
+__all__ = [
+    'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
+    'stop_profiler'
+]

 NVPROF_CONFIG = [
     "gpustarttimestamp",
@@ -72,20 +75,31 @@ def reset_profiler():
     core.reset_profiler()


-@contextmanager
-def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
-    """The profiler interface.
-    Different from cuda_profiler, this profiler can be used to profile both CPU
-    and GPU program. By defalut, it records the CPU and GPU operator kernels,
-    if you want to profile other program, you can refer the profiling tutorial
-    to add more records.
+def start_profiler(state):
+    """Enable the profiler.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU', 'GPU'
+            or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
+            GPU as well. 'All' also generates timeline.
+    """
+    if core.is_profiler_enabled():
+        return
+    if state not in ['CPU', 'GPU', "All"]:
+        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
+    if state == "GPU":
+        prof_state = core.ProfilerState.kCUDA
+    elif state == "CPU":
+        prof_state = core.ProfilerState.kCPU
+    else:
+        prof_state = core.ProfilerState.kAll
+    core.enable_profiler(prof_state)
+
+
+def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
+    """Stop the profiler.

     Args:
-        state (string) : The profiling state, which should be 'CPU' or 'GPU',
-            telling the profiler to use CPU timer or GPU timer for profiling.
-            Although users may have already specified the execution place
-            (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
-            would not inherit this place.
         sorted_key (string) : If None, the profiling results will be printed
             in the order of first end time of events. Otherwise, the profiling
             results will be sorted by the this flag. This flag should be one
@@ -98,17 +112,8 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
         profile_path (string) : If state == 'All', it will write a profile
             proto output file.
     """
-    if state not in ['CPU', 'GPU', "All"]:
-        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
-    if state == "GPU":
-        prof_state = core.ProfilerState.kCUDA
-    elif state == "CPU":
-        prof_state = core.ProfilerState.kCPU
-    else:
-        prof_state = core.ProfilerState.kAll
-    core.enable_profiler(prof_state)
-    yield
-
+    if not core.is_profiler_enabled():
+        return
     sorted_key = 'default' if sorted_key is None else sorted_key
     if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
         raise ValueError("The sorted_key must be None or in 'calls', 'total', "
@@ -124,3 +129,34 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     # TODO(qingqing) : redirect C++ ostream to Python stream.
     # with core.ostream_redirect(stdout=True, stderr=True):
     core.disable_profiler(key_map[sorted_key], profile_path)
+
+
+@contextmanager
+def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
+    """The profiler interface.
+    Different from cuda_profiler, this profiler can be used to profile both CPU
+    and GPU program. By defalut, it records the CPU and GPU operator kernels,
+    if you want to profile other program, you can refer the profiling tutorial
+    to add more records.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU' or 'GPU',
+            telling the profiler to use CPU timer or GPU timer for profiling.
+            Although users may have already specified the execution place
+            (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
+            would not inherit this place.
+        sorted_key (string) : If None, the profiling results will be printed
+            in the order of first end time of events. Otherwise, the profiling
+            results will be sorted by the this flag. This flag should be one
+            of 'calls', 'total', 'max', 'min' or 'ave'.
+            The `calls` means sorting by the number of calls.
+            The `total` means sorting by the total execution time.
+            The `max` means sorting by the maximum execution time.
+            The `min` means sorting by the minimum execution time.
+            The `ave` means sorting by the average execution time.
+        profile_path (string) : If state == 'All', it will write a profile
+            proto output file.
+    """
+    start_profiler(state)
+    yield
+    stop_profiler(sorted_key, profile_path)
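After this refactor the profiler context manager is a thin wrapper over the new start/stop pair, so callers whose profiled region does not fit inside a single with block (such as the batch window in fluid_benchmark.py) can call the pair directly. A hedged usage sketch; run_some_batches is a hypothetical placeholder for the real workload:

import paddle.fluid.profiler as profiler

def run_some_batches():
    # hypothetical stand-in for the training loop being profiled
    pass

# Option 1: the context manager, whose public behaviour is unchanged.
with profiler.profiler('CPU', sorted_key='total', profile_path='/tmp/profile'):
    run_some_batches()

# Option 2: the new explicit start/stop pair, usable across loop iterations.
profiler.start_profiler('All')
run_some_batches()
profiler.stop_profiler(sorted_key='total', profile_path='/tmp/profile_0')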
