Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 102 additions & 40 deletions .azure-pipelines/scripts/performance/check_performance.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,127 @@
import logging
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional

LOG_DIR = "/auto-round/log_dir"
logging.basicConfig(level=logging.INFO, format="%(message)s")

LOG_DIR = Path("/auto-round/log_dir")
OUTPUT_BASE_DIR = Path("/auto-round/.azure-pipelines/scripts/performance")

def parse_tuning_time(log_file):
with open(log_file, "r") as f:
content = f.read()

pattern = r"tuning time ([0-9]+\.[0-9]+)"
match = re.search(pattern, content)
@dataclass
class QuantMetrics:
    """Performance metrics extracted from one quantization run.

    Every field defaults to None, meaning "not found in the log / not
    measured"; `parse_log_file` fills in whichever values it can find.
    """

    # Wall-clock tuning time in seconds (last occurrence wins when the log
    # contains several runs).
    tuning_time_s: Optional[float] = None
    # Peak host RAM usage in GB, parsed from the "'peak_ram': ... GB" log line.
    peak_ram_gb: Optional[float] = None
    # Peak device memory usage in GB, parsed from the same log line.
    peak_vram_gb: Optional[float] = None
    # Total on-disk size of the quantized output directory in GB.
    output_size_gb: Optional[float] = None

if match:
elapsed = str_to_float(match.group(1))
return elapsed
return None

def get_dir_size_gb(path: Path) -> float:
    """Return the total size of all regular files under *path*, in GiB.

    Walks the directory recursively and sums the sizes of regular files.
    Returns 0.0 when *path* does not exist or is not a directory; the
    result is rounded to 4 decimal places.
    """
    # Path.is_dir() already returns False for nonexistent paths, so the
    # former separate exists() check was redundant.
    if not path.is_dir():
        return 0.0

    total_bytes = sum(f.stat().st_size for f in path.rglob("*") if f.is_file())
    return round(total_bytes / (1024**3), 4)


def get_tuning_time():
def parse_log_file(log_file: Path) -> QuantMetrics:
    """Parse tuning time and peak memory figures out of a single log file.

    Only the last occurrence of each pattern is kept, so logs that
    accumulate several runs report the most recent measurements.
    """
    metrics = QuantMetrics()

    if not log_file.exists():
        logging.warning(f"Log file not found: {log_file}")
        return metrics

    content = log_file.read_text(encoding="utf-8")

    # Later matches overwrite earlier ones, i.e. the final value wins.
    for time_match in re.finditer(r"tuning time ([0-9]+\.[0-9]+)", content):
        metrics.tuning_time_s = round(float(time_match.group(1)), 4)

    mem_pattern = r"'peak_ram':\s*([\d.]+)\s*GB,\s*'peak_vram':\s*([\d.]+)\s*GB"
    for mem_match in re.finditer(mem_pattern, content):
        metrics.peak_ram_gb = round(float(mem_match.group(1)), 4)
        metrics.peak_vram_gb = round(float(mem_match.group(2)), 4)

    return metrics


def get_tuning_info() -> Dict[str, Dict[str, QuantMetrics]]:
    """Collect per-model metrics for both the current and baseline test runs.

    Returns a mapping of model name -> {"current"|"baseline" -> QuantMetrics},
    combining log-file parsing with the measured output-directory size.
    """
    models = ["Qwen/Qwen3-0.6B"]
    summary: Dict[str, Dict[str, QuantMetrics]] = {}

    for model in models:
        per_mode: Dict[str, QuantMetrics] = {}
        for mode in ("current", "baseline"):
            log_path = LOG_DIR / f"perf_test_{mode}.log"
            logging.info(f"Processing {log_path}...")

            mode_metrics = parse_log_file(log_path)
            # Output size comes from disk, not from the log.
            mode_metrics.output_size_gb = get_dir_size_gb(OUTPUT_BASE_DIR / mode)
            per_mode[mode] = mode_metrics
        summary[model] = per_mode

    return summary


def compare_metric(
    metric_name: str, current: Optional[float], baseline: Optional[float], tolerance: float = 0.1
) -> bool:
    """Check that *current* stays within *tolerance* (relative) of *baseline*.

    Logs a PASS/FAIL line and returns True only when both values are
    available, the baseline is non-zero, and current/baseline lies inside
    [1 - tolerance, 1 + tolerance]. Missing data or a zero baseline is
    logged and treated as a failure.
    """
    if current is None or baseline is None:
        logging.error(f" [-] {metric_name}: Incomplete data (Current: {current}, Baseline: {baseline})")
        return False

    if baseline == 0:
        logging.warning(f" [!] {metric_name}: Baseline is 0, cannot calculate ratio.")
        return False

    rel = current / baseline
    drift = (rel - 1) * 100
    report = f" [*] {metric_name:<20}: Current = {current:<8} | Baseline = {baseline:<8} (Diff: {drift:+.2f}%)"

    within_bounds = 1.0 - tolerance <= rel <= 1.0 + tolerance
    if within_bounds:
        logging.info(f"{report} -> PASS")
    else:
        logging.error(f"{report} -> FAIL")
    return within_bounds


def check_performance():
    """Compare current-run metrics against the baseline and fail on regression.

    Evaluates tuning time, peak RAM, peak VRAM, and output size for every
    model via compare_metric. Logs a summary and calls sys.exit(1) if any
    metric falls outside its tolerance.
    """
    summary = get_tuning_info()
    all_passed = True

    # (display name, QuantMetrics attribute, relative tolerance) per metric.
    # Table-driven instead of four copy-pasted call sites.
    checks = [
        ("Tuning Time (s)", "tuning_time_s", 0.1),
        ("Peak RAM (GB)", "peak_ram_gb", 0.1),
        ("Peak VRAM (GB)", "peak_vram_gb", 0.05),
        ("Output Size (GB)", "output_size_gb", 0.05),
    ]

    for model, modes in summary.items():
        logging.info(f"\nEvaluating Model: {model}")
        logging.info("-" * 60)

        current: QuantMetrics = modes.get("current", QuantMetrics())
        baseline: QuantMetrics = modes.get("baseline", QuantMetrics())

        for name, attr, tol in checks:
            if not compare_metric(name, getattr(current, attr), getattr(baseline, attr), tolerance=tol):
                all_passed = False

    logging.info("=" * 60)
    if all_passed:
        # Fixed message: tolerances differ per metric (±10% / ±5%), so the
        # old blanket "±10%" claim was misleading.
        logging.info("✅ Performance check passed: All metrics are within their configured tolerances.")
    else:
        logging.error("❌ Performance check failed: Current metrics exceed acceptable limits compared to baseline.")
        sys.exit(1)


Expand Down
3 changes: 1 addition & 2 deletions .azure-pipelines/scripts/performance/perf_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,8 @@ function run_performance_test() {
test_mode=$1
cd /auto-round/.azure-pipelines/scripts/performance
local log_file="perf_test_${test_mode}.log"
rm -rf "saved" "${LOG_DIR}/${log_file}"
echo "##[group]run ${test_mode} performance test..."
auto-round --model_name ${model_name} --bits 4 --iters 200 --enable_torch_compile --device hpu --output_dir ./saved 2>&1 | tee -a "${LOG_DIR}/${log_file}"
auto-round --model_name ${model_name} --bits 4 --iters 200 --enable_torch_compile --device hpu --output_dir "./${test_mode}" 2>&1 | tee -a "${LOG_DIR}/${log_file}"
echo "##[endgroup]"
Comment on lines 39 to 42
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tee -a appends to the existing log file, and the script no longer removes prior logs/output directories. This can cause performance parsing to pick up stale runs and inflate output-size measurements across retries. Consider either deleting ${LOG_DIR}/${log_file} and ./${test_mode} before running, or switch tee to overwrite (no -a) and ensure output dirs are cleaned per run.

Copilot uses AI. Check for mistakes.
}

Expand Down
Loading