
Commit fe57cf8
Simplify scan_all_folders and add .txt file support
- Remove subdirectory scanning logic; handle only two cases:
  (1) a single log file → one curve
  (2) a directory with multiple log files → each log file becomes a curve
- Add support for .txt files in addition to .log files
- Simplify further by extracting duplicated logic and removing unnecessary code
- Move speedup pattern matching earlier in parse_logs_to_data for better flow
- Use loops for the e2e/gpu processing to eliminate code duplication
1 parent 81618f8 commit fe57cf8

1 file changed: graph_net/analysis_util.py (48 additions, 85 deletions)
@@ -253,13 +253,16 @@ def parse_logs_to_data(log_file: str) -> list:
             data["correctness"][key.strip()] = values
             continue
 
+        # Check for speedup
+        speedup_match = patterns["speedup"].search(line)
+        if speedup_match:
+            key, value_str = speedup_match.groups()
+            data["performance"]["speedup"][key.strip()] = float(value_str)
+            continue
+
         # Look for the status, and if it's "failed", look ahead to the next line.
         result_status_match = patterns["result_status"].search(line)
         if not result_status_match:
-            speedup_match = patterns["speedup"].search(line)
-            if speedup_match:
-                key, value_str = speedup_match.groups()
-                data["performance"]["speedup"][key.strip()] = float(value_str)
            continue
 
         status = result_status_match.group(1).strip()
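
This hunk hoists the speedup check ahead of the result-status check, so a speedup line is consumed before the status logic ever runs. A minimal self-contained sketch of that flow follows; the regex and the sample log lines are hypothetical stand-ins, since the actual patterns["speedup"] definition is not part of this diff.

import re

# Hypothetical stand-in for patterns["speedup"]; the real regex is defined
# elsewhere in analysis_util.py and may differ in its exact form.
patterns = {"speedup": re.compile(r"speedup\s+(\w+)\s*:\s*([0-9.]+)")}

data = {"performance": {"speedup": {}}}

for line in ["speedup e2e: 1.37", "speedup gpu: 1.52", "result: success"]:
    # After this commit, the speedup check runs first, so a matching line
    # is consumed here and never reaches the result-status handling.
    speedup_match = patterns["speedup"].search(line)
    if speedup_match:
        key, value_str = speedup_match.groups()
        data["performance"]["speedup"][key.strip()] = float(value_str)
        continue

print(data["performance"]["speedup"])  # {'e2e': 1.37, 'gpu': 1.52}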
@@ -291,33 +294,24 @@ def parse_logs_to_data(log_file: str) -> list:
     samples = []
     for run_key, data in all_runs_data.items():
         try:
+            speedup_dict = data["performance"].get("speedup", {})
+
             # Build result field with status and speedup (for compatibility with log2json output format)
-            if data["result"]["status"] == "success":
+            if data["result"]["status"] == "success" and speedup_dict:
                 speedup_data = {}
-                if "e2e" in data["performance"]["speedup"]:
-                    e2e_value = data["performance"]["speedup"]["e2e"]
-                    speedup_data["e2e"] = {"mean": e2e_value}
-                if "gpu" in data["performance"]["speedup"]:
-                    gpu_value = data["performance"]["speedup"]["gpu"]
-                    speedup_data["gpu"] = {"mean": gpu_value}
+                for key in ["e2e", "gpu"]:
+                    if key in speedup_dict:
+                        speedup_data[key] = {"mean": speedup_dict[key]}
                 if speedup_data:
                     data["result"]["speedup"] = speedup_data
 
-            # Ensure performance.speedup.e2e is a direct value (not nested dict)
+            # Ensure performance.speedup.e2e/gpu are direct values (not nested dict)
             # This is required by calculate_s_scores which uses performance_data.get("speedup", {}).get("e2e")
-            speedup_dict = data["performance"].get("speedup")
-            if not speedup_dict:
-                samples.append(data)
-                continue
-
-            if "e2e" in speedup_dict:
-                e2e_val = speedup_dict["e2e"]
-                if isinstance(e2e_val, dict) and "mean" in e2e_val:
-                    speedup_dict["e2e"] = e2e_val["mean"]
-            if "gpu" in speedup_dict:
-                gpu_val = speedup_dict["gpu"]
-                if isinstance(gpu_val, dict) and "mean" in gpu_val:
-                    speedup_dict["gpu"] = gpu_val["mean"]
+            for key in ["e2e", "gpu"]:
+                if key in speedup_dict:
+                    val = speedup_dict[key]
+                    if isinstance(val, dict) and "mean" in val:
+                        speedup_dict[key] = val["mean"]
 
             samples.append(data)
 
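The consolidated loop normalizes the e2e and gpu entries in one pass: a value stored as {"mean": x} is unwrapped to the bare number that calculate_s_scores expects, while values that are already plain numbers pass through untouched. A small sketch with a toy record (the input shape here is assumed for illustration):

# Toy record: upstream code may leave a speedup either as a bare number or
# wrapped as {"mean": ...}; both shapes are shown.
speedup_dict = {"e2e": {"mean": 1.37}, "gpu": 1.52}

# Same loop as the new code: unwrap {"mean": x} to x, leave plain values alone.
for key in ["e2e", "gpu"]:
    if key in speedup_dict:
        val = speedup_dict[key]
        if isinstance(val, dict) and "mean" in val:
            speedup_dict[key] = val["mean"]

print(speedup_dict)  # {'e2e': 1.37, 'gpu': 1.52}, direct values as required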
@@ -335,9 +329,9 @@ def parse_logs_to_data(log_file: str) -> list:
 def scan_all_folders(benchmark_path: str) -> dict:
     """
     Unified entry point that supports log files and directories:
-    - If benchmark_path is a log file → parse it directly and return data as a single curve.
-    - If benchmark_path is a directory → scan for .log files, each log file becomes a curve.
-      If no .log files found in root, scan subdirectories (each subdirectory becomes a curve).
+    - If benchmark_path is a log file (.log or .txt) → parse it directly and return data as a single curve.
+    - If benchmark_path is a directory → scan for .log and .txt files in the directory,
+      each log file becomes a curve.
     Returns dict[curve_name] -> list_of_samples
     """
     # Handle single log file
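
Per the updated docstring, the two supported call patterns would look roughly like this; the import path follows the file's location in the repo, and the result paths and curve names are hypothetical placeholders:

from graph_net.analysis_util import scan_all_folders

# Case 1: a single .log or .txt file -> one curve keyed by the file stem.
curves = scan_all_folders("results/run_a.log")  # e.g. {"run_a": [...]}

# Case 2: a directory -> one curve per .log/.txt file found directly inside it.
curves = scan_all_folders("results/")  # e.g. {"run_a": [...], "run_b": [...]}

for curve_name, samples in curves.items():
    print(f"{curve_name}: {len(samples)} samples")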
@@ -366,66 +360,38 @@ def scan_all_folders(benchmark_path: str) -> dict:
 
     print(f"Scanning '{benchmark_path}' ...")
 
-    # Find .log files in the root directory
-    log_files = [
-        f
-        for f in os.listdir(benchmark_path)
-        if os.path.isfile(os.path.join(benchmark_path, f)) and f.endswith(".log")
-    ]
-
-    # Process root-level log files
-    if log_files:
-        all_results = {}
-        print(f" - Found {len(log_files)} log file(s) → each becomes a curve.")
-        for log_file in sorted(log_files):
-            log_file_path = os.path.join(benchmark_path, log_file)
-            samples = parse_logs_to_data(log_file_path)
-            if not samples:
-                continue
-
-            curve_name = os.path.splitext(log_file)[0] or "benchmark"
-            all_results[curve_name] = samples
-            print(f" - Curve '{curve_name}': {len(samples)} samples.")
-
-        if not all_results:
-            print(" - No valid data found in any log file.")
-            return {}
-
-        print(f"Total curves loaded: {len(all_results)}")
-        return all_results
-
-    # Fall back to subdirectories
-    all_results = {}
-    print(" - No log files found in root → scanning sub-folders.")
-    for entry in os.listdir(benchmark_path):
-        folder_full_path = os.path.join(benchmark_path, entry)
-        if not os.path.isdir(folder_full_path):
-            continue
-
-        # Find log files in subdirectory
-        sub_log_files = [
+    # Find .log and .txt files in the directory
+    log_files = sorted(
+        [
             f
-            for f in os.listdir(folder_full_path)
-            if os.path.isfile(os.path.join(folder_full_path, f)) and f.endswith(".log")
+            for f in os.listdir(benchmark_path)
+            if os.path.isfile(os.path.join(benchmark_path, f))
+            and f.endswith((".log", ".txt"))
         ]
-        if not sub_log_files:
-            continue
+    )
 
-        # Parse and combine log files from subdirectory
-        combined_samples = []
-        for log_file in sub_log_files:
-            log_file_path = os.path.join(folder_full_path, log_file)
-            samples = parse_logs_to_data(log_file_path)
-            combined_samples.extend(samples)
+    if not log_files:
+        print(" - No log files (.log or .txt) found in directory.")
+        return {}
 
-        if not combined_samples:
+    # Process log files, each becomes a curve
+    all_results = {}
+    print(f" - Found {len(log_files)} log file(s) → each becomes a curve.")
+    for log_file in log_files:
+        log_file_path = os.path.join(benchmark_path, log_file)
+        samples = parse_logs_to_data(log_file_path)
+        if not samples:
             continue
 
-        all_results[entry] = combined_samples
-        print(f" - Folder '{entry}': {len(combined_samples)} samples from log files.")
+        curve_name = os.path.splitext(log_file)[0] or "benchmark"
+        all_results[curve_name] = samples
+        print(f" - Curve '{curve_name}': {len(samples)} samples.")
+
+    if not all_results:
+        print(" - No valid data found in any log file.")
+        return {}
 
-    if all_results:
-        print(f"Total folders loaded: {len(all_results)}")
+    print(f"Total curves loaded: {len(all_results)}")
     return all_results
 
 
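The rewritten listing relies on str.endswith accepting a tuple of suffixes, which picks up both extensions in a single check, and on sorted() for a deterministic curve order across filesystems. Extracted as a standalone helper for illustration (find_log_files is a hypothetical name, not part of the module):

import os

def find_log_files(benchmark_path: str) -> list:
    """Mirror of the inline listing: non-recursive, .log/.txt only, sorted."""
    return sorted(
        f
        for f in os.listdir(benchmark_path)
        if os.path.isfile(os.path.join(benchmark_path, f))
        and f.endswith((".log", ".txt"))
    )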
@@ -566,10 +532,7 @@ def print_stat_info(
 
     # Determine the true state of the current sample (for statistics and S curve)
     is_correct = False
-    if fail_type is not None:
-        # Already has a failure type, skip correctness check
-        pass
-    else:
+    if fail_type is None:
         datatype_data = performance_data.get("datatype", {})
         eager_dtypes = datatype_data.get("eager", [])
         compiled_dtypes = datatype_data.get("compiled", [])
