
Commit c3c8599

[Other] Optimize codes using list comprehension, implement GraphCompilerBackend for paddle test_compiler. (#327)
* Optimize codes using list comprehension.
* Implement graph compiler backend for paddle.
* Fix the init dtype.
1 parent 7ace5ce commit c3c8599
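
The core of this change is the new GraphCompilerBackend interface: graph_net/paddle/test_compiler.py now resolves the --compiler argument through a small backend registry and uses the same object both to compile the model and to synchronize the device, replacing the old get_synchronizer_func/get_compiled_model pair. A minimal sketch of that flow, using a toy paddle.nn.Linear model and made-up inputs (only the backend classes and the registry shape come from this commit):

# Illustrative usage of the backend registry introduced in this commit.
# The toy model and inputs below are placeholders, not part of the change.
import paddle
from graph_net.paddle.backend.cinn_backend import CinnBackend
from graph_net.paddle.backend.nope_backend import NopeBackend

registry_backend = {"cinn": CinnBackend(), "nope": NopeBackend()}

model = paddle.nn.Linear(8, 8)  # stand-in for the loaded GraphModule
input_spec = [paddle.static.InputSpec([None, 8], "float32", "x")]
compiler = registry_backend["nope"]  # selected via --compiler in test_compiler.py
compiled_model = compiler(model, input_spec)  # CinnBackend would wrap paddle.jit.to_static(...)
out = compiled_model(paddle.rand([4, 8]))  # run the (possibly compiled) model
compiler.synchronize()  # device sync now goes through the backend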

6 files changed: 110 additions, 81 deletions

graph_net/paddle/backend/cinn_backend.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+import paddle
+from graph_net.paddle.backend.graph_compiler_backend import GraphCompilerBackend
+
+
+class CinnBackend(GraphCompilerBackend):
+    def __call__(self, model, input_spec=None):
+        build_strategy = paddle.static.BuildStrategy()
+        compiled_model = paddle.jit.to_static(
+            model,
+            input_spec=input_spec,
+            build_strategy=build_strategy,
+            full_graph=True,
+        )
+        compiled_model.eval()
+        program = compiled_model.forward.concrete_program.main_program
+        return compiled_model
+
+    def synchronize(self):
+        if (
+            paddle.device.is_compiled_with_cuda()
+            or paddle.device.is_compiled_with_rocm()
+        ):
+            paddle.device.synchronize()
graph_net/paddle/backend/graph_compiler_backend.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+class GraphCompilerBackend:
+    def __call__(self, model, input_spec=None):
+        raise NotImplementedError()
+
+    def synchronize(self):
+        raise NotImplementedError()
graph_net/paddle/backend/nope_backend.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+import paddle
+from graph_net.paddle.backend.graph_compiler_backend import GraphCompilerBackend
+
+
+class NopeBackend(GraphCompilerBackend):
+    def __call__(self, model, input_spec=None):
+        return model
+
+    def synchronize(self):
+        if (
+            paddle.device.is_compiled_with_cuda()
+            or paddle.device.is_compiled_with_rocm()
+        ):
+            paddle.device.synchronize()

graph_net/paddle/test_compiler.py

Lines changed: 41 additions & 40 deletions
@@ -17,6 +17,21 @@
 from graph_net import path_utils
 from graph_net import test_compiler_util
 
+from graph_net.paddle.backend.graph_compiler_backend import GraphCompilerBackend
+from graph_net.paddle.backend.cinn_backend import CinnBackend
+from graph_net.paddle.backend.nope_backend import NopeBackend
+
+
+registry_backend = {
+    "cinn": CinnBackend(),
+    "nope": NopeBackend(),
+}
+
+
+def get_compiler_backend(args) -> GraphCompilerBackend:
+    assert args.compiler in registry_backend, f"Unknown compiler: {args.compiler}"
+    return registry_backend[args.compiler]
+
 
 def set_seed(random_seed):
     paddle.seed(random_seed)
@@ -60,10 +75,6 @@ def load_class_from_file(file_path: str, class_name: str):
     return model_class
 
 
-def get_synchronizer_func(args):
-    return paddle.device.synchronize
-
-
 def get_model(model_path):
     model_class = load_class_from_file(
         f"{model_path}/model.py", class_name="GraphModule"
@@ -91,22 +102,6 @@ def get_input_spec(model_path):
     return input_spec
 
 
-def get_compiled_model(args, model):
-    if args.compiler == "nope":
-        return model
-    input_spec = get_input_spec(args.model_path)
-    build_strategy = paddle.static.BuildStrategy()
-    compiled_model = paddle.jit.to_static(
-        model,
-        input_spec=input_spec,
-        build_strategy=build_strategy,
-        full_graph=True,
-    )
-    compiled_model.eval()
-    program = compiled_model.forward.concrete_program.main_program
-    return compiled_model
-
-
 def get_static_model(args, model):
     static_model = paddle.jit.to_static(
         model,
@@ -119,7 +114,7 @@ def get_static_model(args, model):
     return static_model
 
 
-def measure_performance(model_call, args, synchronizer_func, profile=False):
+def measure_performance(model_call, args, compiler, profile=False):
     runtime_seed = 1024
     stats = {}
 
@@ -129,7 +124,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
     # Warmup runs
     for _ in range(args.warmup):
         model_call()
-        synchronizer_func()
+        compiler.synchronize()
 
     hardware_name = get_hardward_name(args)
     print(
@@ -152,7 +147,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
         for i in range(args.trials):
             # End-to-end timing (naive_timer)
             duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, synchronizer_func):
+            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
                 # GPU-only timing (CUDA Events)
                 start_event = paddle.device.Event(enable_timing=True)
                 end_event = paddle.device.Event(enable_timing=True)
@@ -178,7 +173,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
         e2e_times = []
         for i in range(args.trials):
             duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, synchronizer_func):
+            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
                 model_call()
             print(
                 f"Trial {i + 1}: e2e={duration_box.value:.4f} ms",
@@ -247,8 +242,25 @@ def transfer_to_float(origin_outputs):
     )
 
 
+def check_and_print_gpu_utilization(compiler):
+    if paddle.device.is_compiled_with_cuda():
+        device_id = int(paddle.device.get_device().split(":")[-1])
+        device_count = paddle.device.cuda.device_count()
+        gpu_util, mem_util = test_compiler_util.get_device_utilization(
+            device_id, device_count, compiler.synchronize
+        )
+        if gpu_util is not None and mem_util is not None:
+            print(
+                f"Device status: gpu_id {device_id}, gpu_util {gpu_util:.2f}%, mem_util {mem_util:.2f}%",
+                file=sys.stderr,
+                flush=True,
+            )
+
+
 def test_single_model(args):
-    synchronizer_func = get_synchronizer_func(args)
+    compiler = get_compiler_backend(args)
+    check_and_print_gpu_utilization(compiler)
+
     input_dict = get_input_dict(args.model_path)
     model = get_model(args.model_path)
     model.eval()
@@ -264,7 +276,7 @@ def test_single_model(args):
         print("Run model in eager mode.", file=sys.stderr, flush=True)
         static_model = get_static_model(args, model)
         expected_out, eager_time_stats = measure_performance(
-            lambda: static_model(**input_dict), args, synchronizer_func, profile=False
+            lambda: static_model(**input_dict), args, compiler, profile=False
         )
         eager_success = True
     except Exception as e:
@@ -279,9 +291,10 @@ def test_single_model(args):
     compiled_time_stats = {}
     try:
         print("Run model in compiled mode.", file=sys.stderr, flush=True)
-        compiled_model = get_compiled_model(args, model)
+        input_spec = get_input_spec(args.model_path)
+        compiled_model = compiler(model, input_spec)
         compiled_out, compiled_time_stats = measure_performance(
-            lambda: compiled_model(**input_dict), args, synchronizer_func, profile=False
+            lambda: compiled_model(**input_dict), args, compiler, profile=False
         )
         compiled_success = True
     except Exception as e:
@@ -415,18 +428,6 @@ def main(args):
     set_seed(random_seed=initalize_seed)
 
     if path_utils.is_single_model_dir(args.model_path):
-        if paddle.device.is_compiled_with_cuda():
-            device_id = int(paddle.device.get_device().split(":")[-1])
-            device_count = paddle.device.cuda.device_count()
-            gpu_util, mem_util = test_compiler_util.get_device_utilization(
-                device_id, device_count, get_synchronizer_func(args)
-            )
-            if gpu_util is not None and mem_util is not None:
-                print(
-                    f"Device status: gpu_id {device_id}, gpu_util {gpu_util:.2f}%, mem_util {mem_util:.2f}%",
-                    file=sys.stderr,
-                    flush=True,
-                )
         test_single_model(args)
     else:
         test_multi_models(args)

graph_net/paddle/utils.py

Lines changed: 1 addition & 1 deletion
@@ -213,7 +213,7 @@ def replay_tensor(info):
         ).to(device)
     else:
         if mean is not None and std is not None:
-            tensor = paddle.empty(shape=shape, dtype=dtype)
+            tensor = paddle.empty(shape=shape, dtype="float32")
             initializer = paddle.nn.initializer.TruncatedNormal(
                 mean=mean, std=std, a=min_val, b=max_val
             )

graph_net/test_compiler_util.py

Lines changed: 25 additions & 40 deletions
@@ -53,29 +53,17 @@ def get_device_utilization(device_id, device_count, synchronizer_func):
         synchronizer_func()
         time.sleep(1)
 
-        output = (
-            subprocess.check_output(
-                [
-                    "nvidia-smi",
-                    f"--query-gpu=index,gpu_uuid,utilization.gpu,memory.used,memory.total",
-                    "--format=csv,noheader,nounits",
-                ]
-            )
-            .decode()
-            .strip()
+        cmd = [
+            "nvidia-smi",
+            f"--query-gpu=index,gpu_uuid,utilization.gpu,memory.used,memory.total",
+            "--format=csv,noheader,nounits",
+        ]
+        output = subprocess.check_output(cmd).decode().strip()
+        _, selected_gpu_uuid, gpu_util, used_mem, mem_total = next(
+            line.split(", ")
+            for line in output.split("\n")
+            if line.strip() and int(line.split(", ")[0]) == selected_gpu_id
         )
-        for line in output.split("\n"):
-            if line.strip():
-                (
-                    gpu_id,
-                    selected_gpu_uuid,
-                    gpu_util,
-                    used_mem,
-                    mem_total,
-                ) = line.split(", ")
-                if int(gpu_id) == selected_gpu_id:
-                    break
-
         gpu_util = float(gpu_util)
         mem_util = float(used_mem) * 100 / float(mem_total)
         print(
@@ -88,22 +76,19 @@ def get_device_utilization(device_id, device_count, synchronizer_func):
         max_mem_util = mem_util if mem_util > max_mem_util else max_mem_util
 
         other_tasks = []
-        output = (
-            subprocess.check_output(
-                [
-                    "nvidia-smi",
-                    f"--query-compute-apps=gpu_uuid,pid,used_memory",
-                    "--format=csv,noheader,nounits",
-                ]
-            )
-            .decode()
-            .strip()
-        )
-        for line in output.split("\n"):
-            if line.strip():
-                gpu_uuid, pid, used_memory = line.split(", ")
-                if gpu_uuid == selected_gpu_uuid and int(pid) != current_pid:
-                    other_tasks.append(line)
+        cmd = [
+            "nvidia-smi",
+            f"--query-compute-apps=gpu_uuid,pid,used_memory",
+            "--format=csv,noheader,nounits",
+        ]
+        output = subprocess.check_output(cmd).decode().strip()
+        other_tasks = [
+            line
+            for line in output.split("\n")
+            if line.strip()
+            and (line.split(", ")[0] == selected_gpu_uuid)
+            and (line.split(", ")[1] != current_pid)
+        ]
         # Note: in docker container, the current_pid maybe different from that captured by nvidia-smi.
         print(
             f"Note: There are {len(other_tasks)} tasks running on GPU {selected_gpu_id} (current_pid:{current_pid}).",
@@ -195,11 +180,11 @@ def convert_to_str(b):
 
 
 def print_times_and_speedup(args, eager_stats, compiled_stats):
-    if not eager_stats:
+    if eager_stats:
         print_with_log_prompt(
             "[Performance][eager]:", json.dumps(eager_stats), args.log_prompt
         )
-    if not compiled_stats:
+    if compiled_stats:
         print_with_log_prompt(
             "[Performance][compiled]:", json.dumps(compiled_stats), args.log_prompt
         )
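
For reference, the list-comprehension cleanup in get_device_utilization replaces the explicit parsing loops over nvidia-smi's CSV output with a next(...) expression and a list comprehension. A small standalone illustration of the same row-selection pattern on fabricated sample data:

# Fabricated sample rows in the "index, gpu_uuid, utilization.gpu, memory.used, memory.total" format.
output = "0, GPU-aaaa, 35, 1024, 16384\n1, GPU-bbbb, 0, 512, 16384"
selected_gpu_id = 1

# Pick the fields of the row whose first column matches the selected GPU id.
_, selected_gpu_uuid, gpu_util, used_mem, mem_total = next(
    line.split(", ")
    for line in output.split("\n")
    if line.strip() and int(line.split(", ")[0]) == selected_gpu_id
)
print(selected_gpu_uuid, gpu_util, used_mem, mem_total)  # GPU-bbbb 0 512 16384

One behavioral difference worth noting: next(...) raises StopIteration if no row matches the selected id, whereas the removed loop simply fell through with the last parsed row.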
