Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 100 additions & 40 deletions .azure-pipelines/scripts/performance/check_performance.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,125 @@
import re
import sys
import logging
from pathlib import Path
from dataclasses import dataclass
from typing import Optional, Dict

LOG_DIR = "/auto-round/log_dir"
logging.basicConfig(level=logging.INFO, format="%(message)s")

LOG_DIR = Path("/auto-round/log_dir")
OUTPUT_BASE_DIR = Path("/auto-round/.azure-pipelines/scripts/performance")

def parse_tuning_time(log_file):
with open(log_file, "r") as f:
content = f.read()

pattern = r"tuning time ([0-9]+\.[0-9]+)"
match = re.search(pattern, content)
@dataclass
class QuantMetrics:
    """Metrics collected from one quantization performance run.

    Every field defaults to ``None``, meaning "not measured / not found";
    the parser and directory scanner fill in whatever they can.
    """

    # Wall-clock tuning time reported in the run log, in seconds.
    tuning_time_s: Optional[float] = None
    # Peak host RAM usage reported in the run log, in GB.
    peak_ram_gb: Optional[float] = None
    # Peak device (accelerator) memory usage reported in the run log, in GB.
    peak_vram_gb: Optional[float] = None
    # Total size of the quantized model output directory, in GB.
    output_size_gb: Optional[float] = None

if match:
elapsed = str_to_float(match.group(1))
return elapsed
return None

def get_dir_size_gb(path: Path) -> float:
    """Return the combined size of all regular files under *path*, in GB.

    The result is rounded to 4 decimal places. A path that is missing or
    is not a directory yields 0.0 rather than raising.
    """
    if not (path.exists() and path.is_dir()):
        return 0.0

    size_bytes = 0
    for entry in path.rglob("*"):
        if entry.is_file():
            size_bytes += entry.stat().st_size
    return round(size_bytes / (1024 ** 3), 4)


def get_tuning_time():
def parse_log_file(log_file: Path) -> QuantMetrics:
    """Extract tuning-time and peak-memory metrics from an auto-round log.

    Args:
        log_file: Path to the ``perf_test_*.log`` file produced by a run.

    Returns:
        A QuantMetrics with any metric found in the log; fields stay None
        when the log is missing or a pattern does not match.
    """
    metrics = QuantMetrics()

    if not log_file.exists():
        logging.warning(f"Log file not found: {log_file}")
        return metrics

    content = log_file.read_text(encoding="utf-8")

    # Use findall and keep the LAST occurrence: the log is written with
    # `tee -a`, so retried/appended runs would otherwise report a stale
    # value from an earlier run.
    time_matches = re.findall(r"tuning time ([0-9]+\.[0-9]+)", content)
    if time_matches:
        metrics.tuning_time_s = round(float(time_matches[-1]), 4)

    # Parse RAM and VRAM independently and tolerate whitespace after the
    # comma — PeakMemory.get_summary() emits "..., 'peak_vram': ...", so a
    # single pattern requiring ",'peak_vram'" with no space never matches.
    ram_matches = re.findall(r"'peak_ram':\s*([\d.]+)\s*GB?", content)
    if ram_matches:
        metrics.peak_ram_gb = round(float(ram_matches[-1]), 4)

    vram_matches = re.findall(r"'peak_vram':\s*([\d.]+)\s*GB?", content)
    if vram_matches:
        metrics.peak_vram_gb = round(float(vram_matches[-1]), 4)

    return metrics


def get_tuning_info() -> Dict[str, Dict[str, QuantMetrics]]:
    """Collect QuantMetrics for each model, for both test modes.

    Reads the "current" and "baseline" run logs from LOG_DIR and measures
    the corresponding output directories under OUTPUT_BASE_DIR.
    """
    models = ["Qwen/Qwen3-0.6B"]
    summary: Dict[str, Dict[str, QuantMetrics]] = {}

    for model_name in models:
        per_mode: Dict[str, QuantMetrics] = {}
        for mode in ("current", "baseline"):
            log_path = LOG_DIR / f"perf_test_{mode}.log"
            logging.info(f"Processing {log_path}...")

            result = parse_log_file(log_path)
            result.output_size_gb = get_dir_size_gb(OUTPUT_BASE_DIR / mode)
            per_mode[mode] = result
        summary[model_name] = per_mode

    return summary


def compare_metric(
    metric_name: str, current: Optional[float], baseline: Optional[float], tolerance: float = 0.1
) -> bool:
    """Check that *current* is within a relative *tolerance* of *baseline*.

    Returns True only when both values are present, the baseline is
    non-zero, and current/baseline lies in [1 - tolerance, 1 + tolerance].
    Logs a PASS/FAIL line either way.
    """
    if current is None or baseline is None:
        logging.error(f"  [-] {metric_name}: Incomplete data (Current: {current}, Baseline: {baseline})")
        return False

    # A zero baseline makes the ratio undefined; treat it as a failure.
    if baseline == 0:
        logging.warning(f"  [!] {metric_name}: Baseline is 0, cannot calculate ratio.")
        return False

    ratio = current / baseline
    diff_percent = (ratio - 1) * 100
    msg = f"  [*] {metric_name:<20}: Current = {current:<8} | Baseline = {baseline:<8} (Diff: {diff_percent:+.2f}%)"

    lower, upper = 1.0 - tolerance, 1.0 + tolerance
    if lower <= ratio <= upper:
        logging.info(f"{msg} -> PASS")
        return True

    logging.error(f"{msg} -> FAIL")
    return False


def check_performance():
    """Compare current-run metrics against the baseline and gate the build.

    Evaluates tuning time, peak RAM, peak VRAM, and output size per model;
    exits with status 1 when any metric falls outside its tolerance.
    """
    summary = get_tuning_info()
    all_passed = True

    for model, modes in summary.items():
        logging.info(f"\nEvaluating Model: {model}")
        logging.info("-" * 60)

        current: QuantMetrics = modes.get("current", QuantMetrics())
        baseline: QuantMetrics = modes.get("baseline", QuantMetrics())

        # (label, current value, baseline value, relative tolerance)
        checks = [
            ("Tuning Time (s)", current.tuning_time_s, baseline.tuning_time_s, 0.1),
            ("Peak RAM (GB)", current.peak_ram_gb, baseline.peak_ram_gb, 0.1),
            ("Peak VRAM (GB)", current.peak_vram_gb, baseline.peak_vram_gb, 0.05),
            ("Output Size (GB)", current.output_size_gb, baseline.output_size_gb, 0.01),
        ]
        for label, cur_val, base_val, tol in checks:
            if not compare_metric(label, cur_val, base_val, tolerance=tol):
                all_passed = False

    # NOTE(review): final verdict assumed to be emitted once after all
    # models (diff indentation was ambiguous) — confirm against the repo.
    logging.info("=" * 60)
    if all_passed:
        logging.info("✅ Performance check passed: All metrics are within acceptable limits (±10%).")
    else:
        logging.error("❌ Performance check failed: Current metrics exceed acceptable limits compared to baseline.")
        sys.exit(1)


Expand Down
3 changes: 1 addition & 2 deletions .azure-pipelines/scripts/performance/perf_test.sh
Original file line number Diff line number Diff line change
function run_performance_test() {
    test_mode=$1
    cd /auto-round/.azure-pipelines/scripts/performance
    local log_file="perf_test_${test_mode}.log"
    # Clean up any previous run: `tee` below otherwise appends to a stale
    # log (performance parsing would pick up an older run) and a leftover
    # output dir would inflate the output-size measurement across retries.
    rm -rf "./${test_mode}" "${LOG_DIR}/${log_file}"
    echo "##[group]run ${test_mode} performance test..."
    auto-round --model_name ${model_name} --bits 4 --iters 200 --enable_torch_compile --device hpu --output_dir "./${test_mode}" 2>&1 | tee "${LOG_DIR}/${log_file}"
    echo "##[endgroup]"
}

Expand Down
Loading