Skip to content

Commit 804a297

Browse files
committed
同步远程代码
2 parents 5a640e2 + f83ad6f commit 804a297

File tree

16 files changed

+623
-127
lines changed

16 files changed

+623
-127
lines changed

graph_net/analysis_util.py

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,10 @@ def print_stat_info(
254254
# pi is a list of constants for t > 0 for each group
255255
pi = [0, 0]
256256

257+
is_correct_at_t1 = [False] * total_samples
258+
speedup_at_t1 = [None] * total_samples
259+
fail_type_at_t1 = ["CORRECT"] * total_samples
260+
257261
final_correct_count = 0
258262
final_correct_negative_speedup_count = 0
259263
final_correct_speedups = []
@@ -291,8 +295,8 @@ def print_stat_info(
291295
get_correctness(eager_dtypes[i], t_key, correctness_data, i)
292296
for i in range(output_count)
293297
)
294-
if not is_correct:
295-
fail_type = "accuracy"
298+
if not is_correct:
299+
fail_type = "accuracy"
296300

297301
# Collect statistics
298302
if is_correct:
@@ -306,6 +310,11 @@ def print_stat_info(
306310
if fail_type == "accuracy":
307311
acc_failure_count += 1
308312

313+
if t_key == 1:
314+
is_correct_at_t1[idx] = is_correct
315+
speedup_at_t1[idx] = speedup
316+
fail_type_at_t1[idx] = fail_type if fail_type is not None else "CORRECT"
317+
309318
# S(t) calculation
310319
if fail_type is not None or speedup is None:
311320
regularized_speedup = fpdb
@@ -320,37 +329,25 @@ def print_stat_info(
320329
# ES(t) calculation: based on state change
321330
rec_speedup_fake_degrad = 0
322331
if t_key < 1:
323-
# When t < 1, ES behaves the same as S
324332
if fail_type is not None or speedup is None:
325333
rec_speedup_fake_degrad = fpdb
326-
# print(f"sample: {sample.get('configuration').get('model')}, fail_type: {fail_type}, rec_speedup_fake_degrad: {rec_speedup_fake_degrad}")
327334
else:
328335
rec_speedup_fake_degrad = (
329336
speedup ** (negative_speedup_penalty + 1)
330337
if speedup < 1
331338
else speedup
332339
)
333340
else:
334-
# When t >= 1, ES starts applying stepwise logic
335-
# ES curve's stepwise state, initialized as 'CORRECT'
336-
es_status = ["CORRECT"] * total_samples
337-
if es_status[idx] == "CORRECT" and fail_type is not None:
338-
es_status[idx] = fail_type
339-
340-
if (
341-
es_status[idx] is not None
342-
and es_status[idx] != "CORRECT"
343-
or speedup is None
344-
):
341+
if not is_correct_at_t1[idx] or speedup_at_t1[idx] is None:
342+
fail_type_frozen = fail_type_at_t1[idx]
345343
rec_speedup_fake_degrad = fake_perf_degrad(
346-
t_key, es_status[idx], fpdb
344+
t_key, fail_type_frozen, fpdb
347345
)
348-
# print(f"sample: {sample.get('configuration').get('model')}, error type: {es_status[idx]}, rec_speedup_fake_degrad: {rec_speedup_fake_degrad}")
349-
else: # Still in a "CORRECT" state
346+
else:
350347
rec_speedup_fake_degrad = (
351-
speedup ** (negative_speedup_penalty + 1)
352-
if speedup < 1
353-
else speedup
348+
speedup_at_t1[idx] ** (negative_speedup_penalty + 1)
349+
if speedup_at_t1[idx] < 1
350+
else speedup_at_t1[idx]
354351
)
355352
rectified_speedups_fake_degrad.append(rec_speedup_fake_degrad)
356353

@@ -399,4 +396,3 @@ def print_stat_info(
399396
print(f" - pi: {pi}")
400397

401398
return s_scores, s_scores_fake_degrad
402-
return s_scores, es_scores
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import paddle
2+
from graph_net.paddle.backend.graph_compiler_backend import GraphCompilerBackend
3+
4+
5+
class CinnBackend(GraphCompilerBackend):
    """Backend that converts a model with ``paddle.jit.to_static``.

    NOTE(review): the class name suggests the CINN compiler; whether CINN is
    actually enabled depends on Paddle configuration not visible here.
    """

    def __call__(self, model, input_spec=None):
        """Convert ``model`` to a full-graph static model in eval mode.

        Args:
            model: The Paddle model to convert.
            input_spec: Optional input specification forwarded to
                ``paddle.jit.to_static``.

        Returns:
            The object returned by ``paddle.jit.to_static`` after ``eval()``
            has been called on it.
        """
        build_strategy = paddle.static.BuildStrategy()
        compiled_model = paddle.jit.to_static(
            model,
            input_spec=input_spec,
            build_strategy=build_strategy,
            full_graph=True,
        )
        compiled_model.eval()
        # The resulting program is not used. The attribute access itself is
        # kept because it presumably forces the program to be built here
        # rather than on the first timed call -- TODO confirm.
        _ = compiled_model.forward.concrete_program.main_program
        return compiled_model

    def synchronize(self):
        """Synchronize the device when Paddle is built with CUDA or ROCm."""
        if (
            paddle.device.is_compiled_with_cuda()
            or paddle.device.is_compiled_with_rocm()
        ):
            paddle.device.synchronize()
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
class GraphCompilerBackend:
    """Interface shared by graph compiler backends.

    Both methods raise ``NotImplementedError`` here; concrete backends are
    expected to override them.
    """

    def __call__(self, model, input_spec=None):
        """Return a runnable version of ``model``.

        Args:
            model: The model to compile or pass through.
            input_spec: Optional input specification for the compilation.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def synchronize(self):
        """Wait for outstanding device work to finish.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import paddle
2+
from graph_net.paddle.backend.graph_compiler_backend import GraphCompilerBackend
3+
4+
5+
class NopeBackend(GraphCompilerBackend):
    """Pass-through backend that performs no compilation."""

    def __call__(self, model, input_spec=None):
        """Return ``model`` unchanged.

        Args:
            model: The model to run as-is.
            input_spec: Accepted for interface compatibility; not used.

        Returns:
            The same ``model`` object that was passed in.
        """
        return model

    def synchronize(self):
        """Synchronize the device when Paddle is built with CUDA or ROCm."""
        if (
            paddle.device.is_compiled_with_cuda()
            or paddle.device.is_compiled_with_rocm()
        ):
            paddle.device.synchronize()

graph_net/paddle/test_compiler.py

Lines changed: 60 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,21 @@
1717
from graph_net import path_utils
1818
from graph_net import test_compiler_util
1919

20+
from graph_net.paddle.backend.graph_compiler_backend import GraphCompilerBackend
21+
from graph_net.paddle.backend.cinn_backend import CinnBackend
22+
from graph_net.paddle.backend.nope_backend import NopeBackend
23+
24+
25+
# Maps the compiler name to the backend instance that handles it. The
# instances are created once, at import time, and shared by every caller.
registry_backend = {
    "cinn": CinnBackend(),
    "nope": NopeBackend(),
}
29+
30+
31+
def get_compiler_backend(args) -> GraphCompilerBackend:
    """Return the backend instance registered for ``args.compiler``.

    Args:
        args: Object with a ``compiler`` attribute naming the backend.

    Returns:
        The entry of ``registry_backend`` stored under ``args.compiler``.

    Raises:
        AssertionError: If ``args.compiler`` is not a key of
            ``registry_backend``.
    """
    assert args.compiler in registry_backend, f"Unknown compiler: {args.compiler}"
    return registry_backend[args.compiler]
34+
2035

2136
def set_seed(random_seed):
2237
paddle.seed(random_seed)
@@ -25,7 +40,7 @@ def set_seed(random_seed):
2540

2641

2742
def get_hardward_name(args):
28-
if args.device == "cuda":
43+
if test_compiler_util.is_gpu_device(args.device):
2944
hardware = paddle.device.cuda.get_device_name(0)
3045
elif args.device == "cpu":
3146
hardware = platform.processor()
@@ -60,19 +75,15 @@ def load_class_from_file(file_path: str, class_name: str):
6075
return model_class
6176

6277

63-
def get_synchronizer_func(args):
64-
return paddle.device.synchronize
65-
66-
67-
def get_model(args):
78+
def get_model(model_path):
6879
model_class = load_class_from_file(
69-
f"{args.model_path}/model.py", class_name="GraphModule"
80+
f"{model_path}/model.py", class_name="GraphModule"
7081
)
7182
return model_class()
7283

7384

74-
def get_input_dict(args):
75-
inputs_params = utils.load_converted_from_text(f"{args.model_path}")
85+
def get_input_dict(model_path):
86+
inputs_params = utils.load_converted_from_text(f"{model_path}")
7687
params = inputs_params["weight_info"]
7788
inputs = inputs_params["input_info"]
7889

@@ -81,8 +92,8 @@ def get_input_dict(args):
8192
return state_dict
8293

8394

84-
def get_input_spec(args):
85-
inputs_params_list = utils.load_converted_list_from_text(f"{args.model_path}")
95+
def get_input_spec(model_path):
96+
inputs_params_list = utils.load_converted_list_from_text(f"{model_path}")
8697
input_spec = [None] * len(inputs_params_list)
8798
for i, v in enumerate(inputs_params_list):
8899
dtype = v["info"]["dtype"]
@@ -91,26 +102,10 @@ def get_input_spec(args):
91102
return input_spec
92103

93104

94-
def get_compiled_model(args, model):
95-
if args.compiler == "nope":
96-
return model
97-
input_spec = get_input_spec(args)
98-
build_strategy = paddle.static.BuildStrategy()
99-
compiled_model = paddle.jit.to_static(
100-
model,
101-
input_spec=input_spec,
102-
build_strategy=build_strategy,
103-
full_graph=True,
104-
)
105-
compiled_model.eval()
106-
program = compiled_model.forward.concrete_program.main_program
107-
return compiled_model
108-
109-
110105
def get_static_model(args, model):
111106
static_model = paddle.jit.to_static(
112107
model,
113-
input_spec=get_input_spec(args),
108+
input_spec=get_input_spec(args.model_path),
114109
full_graph=True,
115110
backend=None,
116111
)
@@ -119,7 +114,7 @@ def get_static_model(args, model):
119114
return static_model
120115

121116

122-
def measure_performance(model_call, args, synchronizer_func, profile=False):
117+
def measure_performance(model_call, args, compiler, profile=False):
123118
runtime_seed = 1024
124119
stats = {}
125120

@@ -129,7 +124,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
129124
# Warmup runs
130125
for _ in range(args.warmup):
131126
model_call()
132-
synchronizer_func()
127+
compiler.synchronize()
133128

134129
hardware_name = get_hardward_name(args)
135130
print(
@@ -138,7 +133,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
138133
flush=True,
139134
)
140135

141-
if "cuda" in args.device:
136+
if test_compiler_util.is_gpu_device(args.device):
142137
"""
143138
Acknowledgement: We evaluate the performance on both end-to-end and GPU-only timings,
144139
With reference to methods only based on CUDA events from KernelBench in https://github.com/ScalingIntelligence/KernelBench
@@ -152,7 +147,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
152147
for i in range(args.trials):
153148
# End-to-end timing (naive_timer)
154149
duration_box = test_compiler_util.DurationBox(-1)
155-
with test_compiler_util.naive_timer(duration_box, synchronizer_func):
150+
with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
156151
# GPU-only timing (CUDA Events)
157152
start_event = paddle.device.Event(enable_timing=True)
158153
end_event = paddle.device.Event(enable_timing=True)
@@ -178,7 +173,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
178173
e2e_times = []
179174
for i in range(args.trials):
180175
duration_box = test_compiler_util.DurationBox(-1)
181-
with test_compiler_util.naive_timer(duration_box, synchronizer_func):
176+
with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
182177
model_call()
183178
print(
184179
f"Trial {i + 1}: e2e={duration_box.value:.4f} ms",
@@ -247,10 +242,27 @@ def transfer_to_float(origin_outputs):
247242
)
248243

249244

245+
def check_and_print_gpu_utilization(compiler):
    """Print the current device's GPU and memory utilization to stderr.

    Nothing is printed when Paddle is not compiled with CUDA, when the current
    device string carries no numeric index (for example ``"cpu"``), or when
    either utilization value comes back as ``None``.

    Args:
        compiler: Backend object whose ``synchronize`` callable is forwarded
            to ``test_compiler_util.get_device_utilization``.
    """
    if not paddle.device.is_compiled_with_cuda():
        return

    # A CUDA build can still run on a device named without an index ("cpu");
    # converting that name with int() would raise ValueError, so skip the
    # report instead.
    device_index = paddle.device.get_device().rpartition(":")[2]
    if not device_index.isdigit():
        return
    device_id = int(device_index)

    device_count = paddle.device.cuda.device_count()
    gpu_util, mem_util = test_compiler_util.get_device_utilization(
        device_id, device_count, compiler.synchronize
    )
    if gpu_util is not None and mem_util is not None:
        print(
            f"Device status: gpu_id {device_id}, gpu_util {gpu_util:.2f}%, mem_util {mem_util:.2f}%",
            file=sys.stderr,
            flush=True,
        )
258+
259+
250260
def test_single_model(args):
251-
synchronizer_func = get_synchronizer_func(args)
252-
input_dict = get_input_dict(args)
253-
model = get_model(args)
261+
compiler = get_compiler_backend(args)
262+
check_and_print_gpu_utilization(compiler)
263+
264+
input_dict = get_input_dict(args.model_path)
265+
model = get_model(args.model_path)
254266
model.eval()
255267

256268
test_compiler_util.print_basic_config(
@@ -259,11 +271,12 @@ def test_single_model(args):
259271

260272
# Run on eager mode
261273
eager_success = False
274+
eager_time_stats = {}
262275
try:
263276
print("Run model in eager mode.", file=sys.stderr, flush=True)
264277
static_model = get_static_model(args, model)
265278
expected_out, eager_time_stats = measure_performance(
266-
lambda: static_model(**input_dict), args, synchronizer_func, profile=False
279+
lambda: static_model(**input_dict), args, compiler, profile=False
267280
)
268281
eager_success = True
269282
except Exception as e:
@@ -275,11 +288,13 @@ def test_single_model(args):
275288

276289
# Run on compiling mode
277290
compiled_success = False
291+
compiled_time_stats = {}
278292
try:
279293
print("Run model in compiled mode.", file=sys.stderr, flush=True)
280-
compiled_model = get_compiled_model(args, model)
294+
input_spec = get_input_spec(args.model_path)
295+
compiled_model = compiler(model, input_spec)
281296
compiled_out, compiled_time_stats = measure_performance(
282-
lambda: compiled_model(**input_dict), args, synchronizer_func, profile=False
297+
lambda: compiled_model(**input_dict), args, compiler, profile=False
283298
)
284299
compiled_success = True
285300
except Exception as e:
@@ -293,9 +308,9 @@ def test_single_model(args):
293308
if eager_success and compiled_success:
294309
check_outputs(args, expected_out, compiled_out)
295310

296-
test_compiler_util.print_times_and_speedup(
297-
args, eager_time_stats, compiled_time_stats
298-
)
311+
test_compiler_util.print_times_and_speedup(
312+
args, eager_time_stats, compiled_time_stats
313+
)
299314

300315

301316
def get_cmp_equal(expected_out, compiled_out):
@@ -366,20 +381,12 @@ def get_cmp_diff_count(expected_out, compiled_out, atol, rtol):
366381

367382

368383
def test_multi_models(args):
369-
test_samples = None
370-
if args.allow_list is not None:
371-
assert os.path.isfile(args.allow_list)
372-
graphnet_root = path_utils.get_graphnet_root()
373-
print(f"graphnet_root: {graphnet_root}", file=sys.stderr, flush=True)
374-
verified_samples = []
375-
with open(args.verified_samples_list_path, "r") as f:
376-
for line in f.readlines():
377-
test_samples.append(os.path.join(graphnet_root, line.strip()))
384+
test_samples = test_compiler_util.get_allow_samples(args.allow_list)
378385

379386
sample_idx = 0
380387
failed_samples = []
381388
for model_path in path_utils.get_recursively_model_path(args.model_path):
382-
if verified_samples is None or os.path.abspath(model_path) in verified_samples:
389+
if test_samples is None or os.path.abspath(model_path) in test_samples:
383390
print(
384391
f"[{sample_idx}] test_compiler, model_path: {model_path}",
385392
file=sys.stderr,
@@ -415,6 +422,7 @@ def test_multi_models(args):
415422
def main(args):
416423
assert os.path.isdir(args.model_path)
417424
assert args.compiler in {"cinn", "nope"}
425+
assert args.device in ["cuda", "dcu", "cpu"]
418426

419427
initalize_seed = 123
420428
set_seed(random_seed=initalize_seed)

graph_net/paddle/utils.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,10 +213,11 @@ def replay_tensor(info):
213213
).to(device)
214214
else:
215215
if mean is not None and std is not None:
216-
tensor = paddle.empty(shape=shape, dtype=dtype)
217-
paddle.nn.init.trunc_normal_(
218-
tensor=tensor, mean=mean, std=std, a=min_val, b=max_val
216+
tensor = paddle.empty(shape=shape, dtype="float32")
217+
initializer = paddle.nn.initializer.TruncatedNormal(
218+
mean=mean, std=std, a=min_val, b=max_val
219219
)
220+
initializer(tensor)
220221
return tensor.to(dtype).to(device)
221222
else:
222223
return (

0 commit comments

Comments
 (0)