
Commit 4e84a5c

refactor(plot_violin): read data from log files instead of JSON (#367)
* Update
* Update
* refactor(plot_violin): read data from log files instead of JSON
  - Change data source from JSON files in subdirectories to direct log file parsing
  - Use scan_all_folders to support both a single log file and a directory with multiple log files
  - Optimize code structure: eliminate duplicate logic and intermediate variables
  - Merge the data processing flow to reduce loop iterations
  - Improve variable naming semantics: curve_name -> category_name, speedup_data -> speedup_raw/speedup_numeric
  - Unify on 'e2e' speedup only, consistent with the core logic (no fallback to 'gpu')
  - Reduce data processing code from 54 lines to 17 lines
* Remove log2json logic, which could cause conflicts
* Add else branches to all if statements in plot_violin.py with explanatory comments

---------

Co-authored-by: JewelRoam <[email protected]>
1 parent d7cf909 commit 4e84a5c
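
The commit message above sketches the new flow, but the plot_violin.py diff itself is not shown on this page. Below is a minimal sketch of what the refactored collection step might look like. It assumes scan_all_folders(path) yields log file paths for either a single file or a directory of logs (its real signature is not shown here), and it uses a hypothetical parse_e2e_speedup helper sketched further below; deriving the category name from the file name is likewise illustrative.

    import os
    from collections import defaultdict

    def collect_speedups(log_path: str) -> dict:
        # Map {category_name: [e2e speedup values]} from one log file or a
        # directory with multiple log files.
        speedup_by_category = defaultdict(list)
        for log_file in scan_all_folders(log_path):  # single file or directory (assumed)
            category_name = os.path.splitext(os.path.basename(log_file))[0]
            for record in parse_logs_to_data(log_file):
                speedup_raw = record.get("performance", {}).get("speedup")
                speedup_numeric = parse_e2e_speedup(speedup_raw)
                if speedup_numeric is not None:
                    speedup_by_category[category_name].append(speedup_numeric)
        return speedup_by_category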

5 files changed: +112 -394 lines changed

graph_net/analysis_util.py

Lines changed: 3 additions & 82 deletions
@@ -1,88 +1,11 @@
 import os
-import json
 import re
 import numpy as np
 from scipy.stats import gmean
 from collections import OrderedDict, defaultdict
 from graph_net.config.datatype_tolerance_config import get_precision
 
 
-def extract_speedup_data_from_subdirs(benchmark_path: str) -> dict:
-    """
-    Reads speedup data from JSON files within each immediate subdirectory of the benchmark_path.
-    Each subdirectory is treated as a separate category.
-    Returns a dictionary mapping {subdir_name: [speedup_values]}.
-    """
-    data_by_subdir = defaultdict(list)
-
-    if not os.path.exists(benchmark_path):
-        print(f"Error: Path does not exist -> {benchmark_path}")
-        return {}
-
-    try:
-        subdirs = [
-            d
-            for d in os.listdir(benchmark_path)
-            if os.path.isdir(os.path.join(benchmark_path, d))
-        ]
-    except FileNotFoundError:
-        print(f"Error: Benchmark path not found -> {benchmark_path}")
-        return {}
-
-    if not subdirs:
-        print(f"Warning: No subdirectories found in -> {benchmark_path}")
-        return {}
-
-    print(f"Found subdirectories to process: {', '.join(subdirs)}")
-
-    for subdir_name in subdirs:
-        current_dir_path = os.path.join(benchmark_path, subdir_name)
-        # Using scan_all_folders and load_one_folder could be an alternative,
-        # but os.walk is also robust for nested directories if needed in the future.
-        for root, _, files in os.walk(current_dir_path):
-            for file in files:
-                if not file.endswith(".json"):
-                    continue
-
-                json_file = os.path.join(root, file)
-                try:
-                    with open(json_file, "r") as f:
-                        data = json.load(f)
-                    performance = data.get("performance", {})
-                    if not performance:
-                        continue
-
-                    speedup_data = performance.get("speedup")
-                    if isinstance(speedup_data, dict):
-                        # Prioritize 'e2e' speedup, fallback to 'gpu'
-                        if "e2e" in speedup_data:
-                            data_by_subdir[subdir_name].append(speedup_data["e2e"])
-                        elif "gpu" in speedup_data:
-                            data_by_subdir[subdir_name].append(speedup_data["gpu"])
-                    elif isinstance(speedup_data, (float, int)):
-                        data_by_subdir[subdir_name].append(speedup_data)
-
-                except (json.JSONDecodeError, KeyError) as e:
-                    print(
-                        f"Warning: Failed to read or parse file -> {json_file}, Error: {e}"
-                    )
-                    continue
-
-    return data_by_subdir
-
-
-def load_json_file(filepath: str) -> dict:
-    """
-    Safely load a JSON file and return data, return an empty dictionary if loading fails.
-    """
-    try:
-        with open(filepath, "r", encoding="utf-8") as f:
-            return json.load(f)
-    except (json.JSONDecodeError, KeyError) as e:
-        print(f" Warning: Could not process file {filepath}. Error: {e}")
-        return {}
-
-
 def detect_sample_error_code(log_text: str) -> str:
     """
     Detect the error code for a single sample from log text.
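
The deleted block above prioritized 'e2e' speedup with a fallback to 'gpu'; the commit unifies on 'e2e' only. Since parse_logs_to_data now keeps performance values as raw strings (see the hunk below), the selection could look like the hypothetical parse_e2e_speedup helper referenced in the sketch near the top of this page:

    import json

    def parse_e2e_speedup(speedup_raw):
        # speedup_raw is the raw string kept by parse_logs_to_data, assumed to
        # be JSON-like, e.g. '{"e2e": 1.23, "gpu": 1.41}' or a bare number '1.23'.
        if speedup_raw is None:
            return None
        try:
            value = json.loads(speedup_raw)
        except (json.JSONDecodeError, TypeError):
            return None  # unparseable entry; caller skips this sample
        if isinstance(value, dict):
            return value.get("e2e")  # 'e2e' only, per the commit; no 'gpu' fallback
        if isinstance(value, (int, float)):
            return float(value)
        return None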
@@ -154,8 +77,8 @@ def parse_logs_to_data(log_file: str) -> list:
     Parse a structured log file generated by the benchmark script and
     return a list of data dictionaries (one per model-compiler run).
 
-    This function directly parses log files without generating intermediate JSON files.
-    It automatically handles both Paddle (with subgraph) and PyTorch (without subgraph) samples.
+    This function directly parses log files,
+    handling both Paddle (with subgraph) and PyTorch (without subgraph) samples.
 
     Args:
         log_file: Path to the benchmark log file
@@ -229,8 +152,7 @@ def parse_logs_to_data(log_file: str) -> list:
         performance_match = patterns["performance"].search(line)
         if performance_match:
             key, value_str = performance_match.groups()
-            # The performance value is a JSON string, so we load it
-            data["performance"][key.strip()] = json.loads(value_str)
+            data["performance"][key.strip()] = value_str.strip()
             continue
 
         datatype_match = patterns["datatype"].search(line)
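
To make the change concrete, here is how a performance line might be matched. The pattern below is an assumed stand-in for the real patterns["performance"] regex, which is defined elsewhere in analysis_util.py, and the log line format is invented for illustration:

    import re

    performance_pattern = re.compile(r"performance\.(\w+)\s*:\s*(.+)")  # assumed shape

    line = 'performance.speedup : {"e2e": 1.23, "gpu": 1.41}'
    match = performance_pattern.search(line)
    if match:
        key, value_str = match.groups()
        # After this commit the value is stored verbatim; decoding (e.g. json.loads)
        # is deferred to consumers such as plot_violin.py.
        print(key, "->", value_str.strip())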
@@ -409,7 +331,6 @@ def get_correctness(dtype: str, t: int, correctness_data: dict, index: int) -> b
     if atol == 0 and rtol == 0:
         metric_key_to_check = "[equal]"
     else:
-        # Use .2E format to ensure two decimal places and use uppercase E to match JSON log format
         metric_key_to_check = f"[all_close_atol_{atol:.2E}_rtol_{rtol:.2E}]"
 
     result = correctness_data.get(metric_key_to_check)
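
As a worked example of the .2E formatting that the deleted comment described, atol=1e-5 and rtol=1e-3 produce the key below:

    atol, rtol = 1e-5, 1e-3
    key = f"[all_close_atol_{atol:.2E}_rtol_{rtol:.2E}]"
    print(key)  # [all_close_atol_1.00E-05_rtol_1.00E-03]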

graph_net/log2json.py

Lines changed: 0 additions & 202 deletions
This file was deleted.
