diff --git a/.github/workflows/bencher-ab.yml b/.github/workflows/bencher-ab.yml index 39dbfb1990f..efa50a06d60 100644 --- a/.github/workflows/bencher-ab.yml +++ b/.github/workflows/bencher-ab.yml @@ -45,74 +45,22 @@ jobs: - name: Build and run benchmarks run: | git config --global --add safe.directory /__w/CCF/CCF - mkdir build - cd build - cmake -GNinja -DWORKER_THREADS=2 .. - ninja - # Microbenchmarks - ./tests.sh -VV -L benchmark - # End to end performance tests - ./tests.sh -VV -L perf -C perf - # Convert microbenchmark output to bencher json - source env/bin/activate - PYTHONPATH=../tests python convert_pico_to_bencher.py + ./scripts/bench-ab.sh run --results-dir build/bench-ab - - name: Upload PR results + - name: Upload benchmark logs uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 with: - name: benchmark-pr-${{ github.run_id }} - path: build/bencher.json + name: benchmark-pr-logs-${{ github.run_id }} + path: build/bench-ab/logs/*.log + if-no-files-found: ignore retention-days: 7 + if: success() || failure() - benchmark_main: - name: Benchmark Main - runs-on: - [ - self-hosted, - 1ES.Pool=gha-vmss-d16av6-ci, - "JobId=bab_benchmark_main-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}", - ] - if: *check_trigger_conditions - container: - image: mcr.microsoft.com/azurelinux/base/core:3.0 - options: --user root - steps: - - name: Setup container dependencies - run: | - gpg --import /etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY - tdnf -y update && tdnf -y install ca-certificates git - - - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 - with: - ref: main - fetch-depth: 0 - - - name: Install dependencies - run: ./scripts/setup-ci.sh - - - name: Confirm platform - run: python3 tests/infra/platform_detection.py virtual - - - name: Build and run benchmarks - run: | - git config --global --add safe.directory /__w/CCF/CCF - mkdir build - cd build - cmake -GNinja -DWORKER_THREADS=2 .. - ninja - # Microbenchmarks - ./tests.sh -VV -L benchmark - # End to end performance tests - ./tests.sh -VV -L perf -C perf - # Convert microbenchmark output to bencher json - source env/bin/activate - PYTHONPATH=../tests python convert_pico_to_bencher.py - - - name: Upload main results + - name: Upload PR results uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 with: - name: benchmark-main-${{ github.run_id }} - path: build/bencher.json + name: benchmark-pr-${{ github.run_id }} + path: build/bench-ab/bencher-pr-*.json retention-days: 7 compare: @@ -126,35 +74,54 @@ jobs: container: image: mcr.microsoft.com/azurelinux/base/core:3.0 options: --user root - needs: [benchmark_pr, benchmark_main] + needs: [benchmark_pr] if: *check_trigger_conditions steps: - name: Setup container dependencies run: | gpg --import /etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY - tdnf -y update && tdnf -y install ca-certificates git + tdnf -y update && tdnf -y install ca-certificates git gh - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 - name: Download artifacts uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 with: - pattern: benchmark-*-${{ github.run_id }} - merge-multiple: false + pattern: benchmark-pr-${{ github.run_id }} + path: benchmark-pr-${{ github.run_id }} + merge-multiple: true - - name: Generate comparison + - name: Restore main perf results run: | - echo "# Benchmark Comparison: main vs PR" > report.md - echo "" >> report.md - echo "**PR Commit:** \`${{ github.event.pull_request.head.sha }}\`" >> report.md - echo "**Base Commit:** \`${{ github.event.pull_request.base.sha }}\`" >> report.md - echo "**Run ID:** [${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" >> report.md - echo "" >> report.md - echo '```' >> report.md - python3 scripts/compare_bencher_ab.py \ - benchmark-main-${{ github.run_id }}/bencher.json \ - benchmark-pr-${{ github.run_id }}/bencher.json \ - --label1 "main" --label2 "PR" >> report.md - echo '```' >> report.md + set -euo pipefail + mkdir -p perf + rm -f perf/*.json + git config --global --add safe.directory "$GITHUB_WORKSPACE" + run_ids=$(gh api "repos/${{ github.repository }}/actions/artifacts?name=perf-bench-virtual-main&per_page=100" \ + --jq '[.artifacts[] | select(.expired == false)] | sort_by(.created_at) | reverse | .[0:10] | .[].workflow_run.id') + restored=false + for run_id in $run_ids; do + download_dir="prev_artifact/$run_id" + mkdir -p "$download_dir" + if gh run download "$run_id" --name "perf-bench-virtual-main" --dir "$download_dir"; then + if [[ -n "$(find "$download_dir" -name '*.json' -print -quit)" ]]; then + restored=true + fi + fi + done + if [[ "$restored" != "true" ]]; then + echo "No main perf artifacts restored" + exit 1 + fi + find prev_artifact -name '*.json' -exec cp {} perf/ \; + env: + GH_TOKEN: ${{ github.token }} - cat report.md >> $GITHUB_STEP_SUMMARY + - name: Generate comparison + run: | + ./scripts/bench-ab.sh report \ + --main-perf-dir perf \ + --results-dir benchmark-pr-${{ github.run_id }} \ + --label PR \ + --output report.md \ + --summary diff --git a/scripts/bench-ab.sh b/scripts/bench-ab.sh new file mode 100755 index 00000000000..62626073bda --- /dev/null +++ b/scripts/bench-ab.sh @@ -0,0 +1,209 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the Apache 2.0 License. + +set -euo pipefail + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR=$( dirname "$SCRIPT_DIR" ) + +usage() { + cat <<'EOF' +Usage: scripts/bench-ab.sh [run|report|local] [options] + +No command means local: restore main perf, run benchmarks, and write a report. + +Options: + --build-dir DIR Build directory. Default: build-bench-ab + --results-dir DIR Benchmark JSON directory. Default: build/bench-ab + --main-perf-dir DIR Main-branch perf JSON directory. Default: build/bench-ab/main + --output FILE Markdown report path. Default: RESULTS_DIR/report.md + --iterations N Benchmark repetitions. Default: 3 + --label LABEL Report label. Default: run + --summary Append report to GITHUB_STEP_SUMMARY +EOF +} + +abs_path() { + if [[ "$1" == /* ]]; then + echo "$1" + else + echo "$ROOT_DIR/$1" + fi +} + +cmd=local +case "${1:-}" in + -h|--help) + usage + exit 0 + ;; + run|report|local) + cmd=$1 + shift + ;; +esac + +build_dir=build-bench-ab +results_dir=build/bench-ab +main_perf_dir=build/bench-ab/main +output_file= +iterations=3 +label=run +summary=false +repository=${GITHUB_REPOSITORY:-microsoft/CCF} +main_artifact=perf-bench-virtual-main + +while [[ $# -gt 0 ]]; do + opt=$1 + case "$opt" in + --build-dir) build_dir=${2:?$opt requires a value}; shift 2 ;; + --results-dir) results_dir=${2:?$opt requires a value}; shift 2 ;; + --main-perf-dir) main_perf_dir=${2:?$opt requires a value}; shift 2 ;; + --output) output_file=${2:?$opt requires a value}; shift 2 ;; + --iterations) iterations=${2:?$opt requires a value}; shift 2 ;; + --label) label=${2:?$opt requires a value}; shift 2 ;; + --summary) summary=true; shift ;; + *) echo "Unknown argument: $opt" >&2; usage >&2; exit 1 ;; + esac +done + +if ! [[ "$iterations" =~ ^[1-9][0-9]*$ ]]; then + echo "--iterations must be a positive integer" >&2 + exit 1 +fi + +output_file=${output_file:-"$results_dir/report.md"} + +build_dir=$(abs_path "$build_dir") +results_dir=$(abs_path "$results_dir") +main_perf_dir=$(abs_path "$main_perf_dir") +output_file=$(abs_path "$output_file") + +restore_main_perf() { + mkdir -p "$main_perf_dir" + tmp_dir=$(mktemp -d) + restored=false + + echo "Restoring latest main perf results..." + if command -v gh >/dev/null 2>&1; then + run_ids=$(gh api "repos/$repository/actions/artifacts?name=$main_artifact&per_page=100" \ + --jq '[.artifacts[] | select(.expired == false)] | sort_by(.created_at) | reverse | .[0:10] | .[].workflow_run.id') || run_ids= + + for run_id in $run_ids; do + download_dir="$tmp_dir/$run_id" + mkdir -p "$download_dir" + if gh run download "$run_id" --repo "$repository" --name "$main_artifact" --dir "$download_dir" >/dev/null 2>&1; then + [[ -n "$(find "$download_dir" -name '*.json' -print -quit)" ]] && restored=true + fi + done + elif command -v curl >/dev/null 2>&1 && command -v unzip >/dev/null 2>&1; then + artifacts_json="$tmp_dir/artifacts.json" + if curl -fsSL \ + -H "Accept: application/vnd.github+json" \ + "https://api.github.com/repos/$repository/actions/artifacts?name=$main_artifact&per_page=100" \ + -o "$artifacts_json"; then + python3 - "$artifacts_json" > "$tmp_dir/urls" <<'PY' +import json +import sys + +with open(sys.argv[1], encoding="utf-8") as f: + data = json.load(f) + +artifacts = [ + artifact + for artifact in data.get("artifacts", []) + if not artifact.get("expired") +] +for artifact in sorted( + artifacts, key=lambda artifact: artifact.get("created_at", ""), reverse=True +)[:10]: + print(artifact["archive_download_url"]) +PY + + index=0 + while IFS= read -r url; do + index=$((index + 1)) + archive="$tmp_dir/$index.zip" + download_dir="$tmp_dir/$index" + mkdir -p "$download_dir" + if curl -fsSL -L "$url" -o "$archive" && + unzip -q "$archive" -d "$download_dir"; then + [[ -n "$(find "$download_dir" -name '*.json' -print -quit)" ]] && restored=true + fi + done < "$tmp_dir/urls" + fi + fi + + if [[ "$restored" == "true" ]]; then + rm -f "$main_perf_dir"/*.json + find "$tmp_dir" -mindepth 2 -name '*.json' -exec cp {} "$main_perf_dir"/ \; + echo "Main perf results saved in $main_perf_dir" + else + echo "Could not restore main perf results; using $main_perf_dir" >&2 + fi + rm -rf "$tmp_dir" +} + +run_benchmarks() { + mkdir -p "$build_dir" "$results_dir" + rm -f "$results_dir"/bencher-pr-*.json + log_dir="$results_dir/logs" + mkdir -p "$log_dir" + rm -f "$log_dir"/*.log + + build_log="$log_dir/build.log" + echo "Building..." + { + cmake -S "$ROOT_DIR" -B "$build_dir" -GNinja -DWORKER_THREADS=2 && + cmake --build "$build_dir" + } > "$build_log" 2>&1 || { + echo "Build failed. See $build_log" >&2 + exit 1 + } + + pushd "$build_dir" >/dev/null + for ((i = 1; i <= iterations; i++)); do + echo "Running $i/$iterations..." + run_log="$log_dir/iteration-${i}.log" + rm -f bencher.json + { + ./tests.sh -VV -L benchmark && + ./tests.sh -VV -L perf -C perf && + PYTHONPATH="$ROOT_DIR/tests" env/bin/python convert_pico_to_bencher.py + } > "$run_log" 2>&1 || { + echo "Iteration $i failed. See $run_log" >&2 + exit 1 + } + result="$results_dir/bencher-pr-${i}.json" + mv bencher.json "$result" + done + popd >/dev/null + + echo "Benchmark results saved in $results_dir" + echo "Logs saved in $log_dir" +} + +write_report() { + mkdir -p "$( dirname "$output_file" )" + python3 "$ROOT_DIR/scripts/compare_bencher_ab.py" \ + "$main_perf_dir" \ + "$results_dir" \ + --label2 "$label" > "$output_file" + + if [[ "$summary" == "true" ]]; then + if [[ -z "${GITHUB_STEP_SUMMARY:-}" ]]; then + echo "GITHUB_STEP_SUMMARY is not set" >&2 + exit 1 + fi + cat "$output_file" >> "$GITHUB_STEP_SUMMARY" + fi + + echo "Report saved in $output_file" +} + +case "$cmd" in + run) run_benchmarks ;; + report) write_report ;; + local) restore_main_perf; run_benchmarks; write_report; echo "Done." ;; +esac diff --git a/scripts/compare_bencher_ab.py b/scripts/compare_bencher_ab.py index 2f27bec5962..35e33d9e599 100755 --- a/scripts/compare_bencher_ab.py +++ b/scripts/compare_bencher_ab.py @@ -1,16 +1,23 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the Apache 2.0 License. +import argparse import json +import os import sys -import argparse -from typing import Dict +from typing import Dict, List + +from perf_report import METRIC_GROUPS +from perf_report import benchmarks_with_metric, list_perf_files, load_perf_data +from perf_report import render_metric_group +from perf_report import render_runs_table +from perf_report import jobid_sort_key -METADATA_KEY = "__metadata" +MAIN_HISTORY_POINTS = 10 def load_bencher_file(filepath: str) -> Dict: - """Load a bencher.json file""" + """Load a bencher.json file.""" try: with open(filepath, "r") as f: return json.load(f) @@ -22,231 +29,87 @@ def load_bencher_file(filepath: str) -> Dict: sys.exit(1) -def extract_metrics(data: Dict) -> Dict[str, float]: - """Extract metrics from bencher data into a flat dictionary""" - metrics = {} - for test_name, test_data in data.items(): - if test_name == METADATA_KEY: - continue - for metric_type, metric_data in test_data.items(): - key = f"{test_name} - {metric_type}" - if isinstance(metric_data, dict) and "value" in metric_data: - metrics[key] = metric_data["value"] - else: - metrics[key] = metric_data - return metrics - - -def is_higher_better(metric_name: str) -> bool: - """Determine if higher values are better for this metric""" - metric_lower = metric_name.lower() - - # Higher is better for these metrics - if any(keyword in metric_lower for keyword in ["throughput", "rate", "queries"]): - return True - - # Lower is better for these metrics - if any(keyword in metric_lower for keyword in ["latency", "memory"]): - return False - - # Default assumption: higher is better (for most performance metrics) - return True - - -def create_diverging_bar( - change_percent: float, metric_name: str, width: int = 40 -) -> str: - """Create a diverging bar chart centered at 0 using block characters""" - if change_percent == "N/A" or change_percent == "∞": - return " " * (width // 2) + "|" + " " * (width // 2) + " N/A" - - try: - # Parse the percentage - if isinstance(change_percent, str): - change_val = float(change_percent.replace("%", "").replace("+", "")) - else: - change_val = change_percent - except (ValueError, AttributeError): - return " " * (width // 2) + "|" + " " * (width // 2) + " N/A" - - # Center position - center = width // 2 - - # Scale the change to fit the bar width (max ±50% uses full width) - max_change = 50.0 # Cap at ±50% for reasonable scaling - clamped_change = max(-max_change, min(max_change, change_val)) - - # Calculate bar length (half width = max extent) - bar_length = int(abs(clamped_change) / max_change * center) - - # Determine if this change is actually good or bad - higher_is_better = is_higher_better(metric_name) - is_improvement = (change_val > 0 and higher_is_better) or ( - change_val < 0 and not higher_is_better - ) - - if abs(change_val) < 2: # No significant change - bar = " " * center + "|" + " " * center - return bar + f" {change_val:+.1f}%" - elif is_improvement: # This is actually an improvement - if change_val > 0: # Positive change that's good (e.g., higher throughput) - bar = " " * center + "|" + "+" * bar_length + " " * (center - bar_length) - else: # Negative change that's good (e.g., lower latency) - left_start = center - bar_length - bar = " " * left_start + "+" * bar_length + "|" + " " * center - return bar + f" {change_val:+.1f}%" - else: # This is a regression - if change_val > 0: # Positive change that's bad (e.g., higher latency) - bar = " " * center + "|" + "-" * bar_length + " " * (center - bar_length) - else: # Negative change that's bad (e.g., lower throughput) - left_start = center - bar_length - bar = " " * left_start + "-" * bar_length + "|" + " " * center - return bar + f" {change_val:+.1f}%" - - -def calculate_percentage_change(val1: float, val2: float) -> str: - """Calculate percentage change from val1 to val2""" - if val1 is None or val2 is None: - return "N/A" - if val1 == 0: - return "∞" if val2 != 0 else "0%" - - change = ((val2 - val1) / val1) * 100 - sign = "+" if change > 0 else "" - return f"{sign}{change:.1f}%" - - -def create_side_by_side_plot( - file1: str, file2: str, label1: str = None, label2: str = None -): - """Create ASCII side-by-side comparison plot""" - - # Default labels - if not label1: - label1 = file1.replace(".json", "") - if not label2: - label2 = file2.replace(".json", "") - - # Load data - data1 = load_bencher_file(file1) - data2 = load_bencher_file(file2) - - # Extract metrics - metrics1 = extract_metrics(data1) - metrics2 = extract_metrics(data2) - - # Get all unique metric keys - all_keys = set(metrics1.keys()) | set(metrics2.keys()) - all_keys = sorted(all_keys) - - if not all_keys: - print("No metrics found in the files") - return - - # We don't need global min/max anymore since we normalize each metric independently - - # Print header - total_width = 120 - print("=" * total_width) - title = f"BENCHMARK COMPARISON: {label1} vs {label2}" - print(f"{title:^{total_width}}") - print("=" * total_width) - print() - - # Column widths - metric_width = 40 - bar_width = 50 # Width for the diverging bar chart - values_width = 30 - - # Print column headers - print( - f"{'Metric':<{metric_width}} {'Performance Change':^{bar_width}} {'Values':^{values_width}}" - ) - print( - f"{'':<{metric_width}} {'':<{bar_width}} {label1 + ' → ' + label2:^{values_width}}" - ) - print("-" * total_width) - - # Process each metric - for key in all_keys: - val1 = metrics1.get(key) - val2 = metrics2.get(key) +def list_pr_files(path: str) -> List[str]: + """Return one or more PR result files in display order.""" + if os.path.isdir(path): + files = [ + os.path.join(path, name) + for name in sorted(os.listdir(path), key=jobid_sort_key) + if name.endswith(".json") and os.path.isfile(os.path.join(path, name)) + ] + if files: + return files + print(f"Error: No JSON files found in {path}") + sys.exit(1) - # Format metric name - metric_name = key[: metric_width - 1] if len(key) >= metric_width else key + return [path] - # Calculate change - change = calculate_percentage_change(val1, val2) - # Create diverging bar chart with context-aware direction - bar_display = create_diverging_bar(change, key, 40) +def pr_run_label(label: str, index: int, count: int, separator: bool) -> str: + text = label if count == 1 else f"{label} {index}" + return f">> {text}" if separator and index == 1 else text - # Format values - val1_str = f"{val1:.2f}" if val1 is not None else "N/A" - val2_str = f"{val2:.2f}" if val2 is not None else "N/A" - values_display = f"{val1_str} → {val2_str}" - # Print row - print( - f"{metric_name:<{metric_width}} {bar_display:<{bar_width}} {values_display:<{values_width}}" +def render_comparison(main_perf_dir: str, pr_path: str, pr_label: str) -> str: + """Render PR results as final points after main history.""" + main_files = list_perf_files(main_perf_dir) + main_runs = load_perf_data(main_perf_dir, main_files) + pr_files = list_pr_files(pr_path) + pr_runs = [ + ( + pr_run_label(pr_label, index, len(pr_files), bool(main_runs)), + None, + None, + load_bencher_file(pr_file), ) + for index, pr_file in enumerate(pr_files, 1) + ] + loaded = [*main_runs, *pr_runs] + pr_metrics = { + metric: benchmarks_with_metric(pr_runs, metric) + for metric, _, _ in METRIC_GROUPS + } + if not any(pr_metrics.values()): + print("Error: No supported metrics found in PR results", file=sys.stderr) + sys.exit(1) - print() - print("Legend:") - print(" +++| = Improvement (left side, better performance)") - print(" |--- = Regression (right side, worse performance)") - print(" | = No significant change (<2%)") - print(" + = Better performance") - print(" - = Worse performance") - print(" Scale: ±50% change uses full bar width") - print( - " Note: For performance metrics, lower latency = better, higher throughput = better" + main_history = main_runs[-MAIN_HISTORY_POINTS:] + lines = [] + if main_history: + lines.append(render_runs_table(main_history)) + + lines.extend( + ( + render_metric_group( + loaded, + metric, + title, + unit, + benchmarks=pr_metrics[metric], + reference_lines=True, + reference_loaded=main_runs or None, + reference_limit=MAIN_HISTORY_POINTS, + ) + ) + for metric, title, unit in METRIC_GROUPS + if pr_metrics[metric] ) + return "\n".join(lines) - # Summary statistics - print() - print("Summary:") - improvements = 0 - regressions = 0 - no_change = 0 - - for key in all_keys: - val1 = metrics1.get(key) - val2 = metrics2.get(key) - if val1 is not None and val2 is not None and val1 != 0: - change_val = ((val2 - val1) / val1) * 100 - if abs(change_val) < 2: - no_change += 1 - else: - # Use is_higher_better() to determine if this is actually an improvement or regression - higher_is_better = is_higher_better(key) - is_improvement = (change_val > 0 and higher_is_better) or ( - change_val < 0 and not higher_is_better - ) - if is_improvement: - improvements += 1 - else: - regressions += 1 - - total_compared = improvements + regressions + no_change - print(f" Total metrics compared: {total_compared}") - print(f" Improvements: {improvements}") - print(f" Regressions: {regressions}") - print(f" No significant change: {no_change}") - - -def main(): + +def main() -> None: parser = argparse.ArgumentParser( - description="Create ASCII side-by-side comparison of bencher.json files" + description="Create Markdown benchmark plots from main history and PR results." + ) + parser.add_argument( + "main_perf_dir", help="Directory containing main perf JSON files" ) - parser.add_argument("file1", help="First bencher.json file") - parser.add_argument("file2", help="Second bencher.json file") - parser.add_argument("--label1", help="Label for first file (default: filename)") - parser.add_argument("--label2", help="Label for second file (default: filename)") + parser.add_argument("pr_path", help="PR bencher.json file or directory") + parser.add_argument("--label2", default="PR", help="Label for PR result") args = parser.parse_args() - create_side_by_side_plot(args.file1, args.file2, args.label1, args.label2) + print(render_comparison(args.main_perf_dir, args.pr_path, args.label2)) if __name__ == "__main__": diff --git a/scripts/perf_report.py b/scripts/perf_report.py new file mode 100644 index 00000000000..c02d641cd21 --- /dev/null +++ b/scripts/perf_report.py @@ -0,0 +1,364 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the Apache 2.0 License. + +import html +import json +import os +import statistics +from typing import List, Optional, Tuple + +METRIC_GROUPS = [ + ("throughput", "Throughput", "tx/s"), + ("latency", "Latency", "ms"), + ("memory", "Memory", "bytes"), + ("rate", "Rate", "ops/s"), +] +CHART_MAX_POINTS = 30 +CHART_COLUMNS = 3 +CHART_WIDTH = 1100 +CHART_HEIGHT = 560 +CHART_CELL_WIDTH = str(CHART_WIDTH) +EWMA_ALPHA = 0.3 +DEFAULT_REPOSITORY = "microsoft/CCF" +METADATA_KEY = "__metadata" + +PerfRun = Tuple[str, Optional[str], Optional[str], dict] +ChartSeries = List[Tuple[str, float]] + + +def jobid_sort_key(name: str) -> Tuple[int, object]: + """Order perf files chronologically by their numeric job id.""" + stem = name[:-5] if name.endswith(".json") else name + try: + return (0, tuple(int(part) for part in stem.split("-"))) + except ValueError: + return (1, name) + + +def list_perf_files(directory: str) -> List[str]: + """Return perf files in the directory, ordered chronologically.""" + if not os.path.isdir(directory): + return [] + files = [ + name + for name in os.listdir(directory) + if os.path.isfile(os.path.join(directory, name)) + ] + return sorted(files, key=jobid_sort_key) + + +def run_label(name: str) -> str: + """Short x-axis label for a perf file.""" + stem = name[:-5] if name.endswith(".json") else name + parts = stem.split("-") + return parts[1] if len(parts) >= 2 else stem + + +def run_url(name: str) -> Optional[str]: + """GitHub Actions URL for a perf file, when the run id can be parsed.""" + stem = name[:-5] if name.endswith(".json") else name + parts = stem.split("-") + if not parts or not parts[0].isdigit(): + return None + + server_url = os.environ.get("GITHUB_SERVER_URL", "https://github.com").rstrip("/") + repository = os.environ.get("GITHUB_REPOSITORY", DEFAULT_REPOSITORY) + return f"{server_url}/{repository}/actions/runs/{parts[0]}" + + +def commit_url(metadata: dict) -> Optional[str]: + """GitHub commit URL from perf metadata, when available.""" + commit = metadata.get("commit") + if not isinstance(commit, str) or not commit: + return None + + server_url = metadata.get("server_url") or os.environ.get( + "GITHUB_SERVER_URL", "https://github.com" + ) + repository = metadata.get("repository") or os.environ.get( + "GITHUB_REPOSITORY", DEFAULT_REPOSITORY + ) + if not isinstance(server_url, str) or not isinstance(repository, str): + return None + return f"{server_url.rstrip('/')}/{repository}/commit/{commit}" + + +def load_perf_data(directory: str, files: List[str]) -> List[PerfRun]: + """Load (label, run_url, commit_url, data) for each readable perf file.""" + loaded: List[PerfRun] = [] + for name in files: + try: + with open(os.path.join(directory, name), "r") as f: + data = json.load(f) + except (OSError, json.JSONDecodeError): + continue + if isinstance(data, dict): + metadata = data.get(METADATA_KEY, {}) + if not isinstance(metadata, dict): + metadata = {} + loaded.append((run_label(name), run_url(name), commit_url(metadata), data)) + return loaded + + +def metric_value(data: dict, benchmark: str, metric: str) -> Optional[float]: + """Return the numeric value of a benchmark metric, or None if absent.""" + metrics = data.get(benchmark) + if not isinstance(metrics, dict): + return None + entry = metrics.get(metric) + if isinstance(entry, dict): + value = entry.get("value") + else: + value = entry + return value if isinstance(value, (int, float)) else None + + +def benchmarks_with_metric(loaded: List[PerfRun], metric: str) -> List[str]: + """Sorted names of benchmarks that report the given metric in any run.""" + names = set() + for _, _, _, data in loaded: + for benchmark in data: + if benchmark == METADATA_KEY: + continue + if metric_value(data, benchmark, metric) is not None: + names.add(benchmark) + return sorted(names) + + +def ewma(values: List[float], alpha: float = EWMA_ALPHA) -> float: + """Return the exponentially weighted moving average of the values.""" + average = values[0] + for value in values[1:]: + average = alpha * value + (1 - alpha) * average + return average + + +def repeated_values(value: float, count: int) -> str: + """Render a constant series for every chart category.""" + return ", ".join(f"{value:.2f}" for _ in range(count)) + + +def chart_scale(values: List[float], unit: str) -> Tuple[float, str]: + """Scale large chart values to keep axis labels readable.""" + if values and max(abs(value) for value in values) >= 1000: + return 1000, f"K {unit}" + return 1, unit + + +def mermaid_string(value: str) -> str: + """Render a Mermaid string literal.""" + return json.dumps(value, ensure_ascii=True) + + +def render_mermaid_xychart( + series: ChartSeries, + benchmark: str, + metric: str, + unit: str, + *, + reverse_series: bool = False, + reference_lines: bool = True, + reference_series: Optional[ChartSeries] = None, +) -> str: + """Render a Mermaid xychart line chart for a single benchmark metric.""" + ordered_series = list(reversed(series)) if reverse_series else list(series) + labels = ", ".join(mermaid_string(label) for label, _ in ordered_series) + raw_values = [value for _, value in ordered_series] + values_for_scale = list(raw_values) + baseline = None + sigma = None + if reference_lines: + reference_values = [ + value for _, value in (reference_series if reference_series else series) + ] + baseline = ewma(reference_values) + sigma = statistics.pstdev(reference_values) if len(reference_values) > 1 else 0 + values_for_scale.extend([baseline, baseline - sigma, baseline + sigma]) + scale, chart_unit = chart_scale(values_for_scale, unit) + values = ", ".join(f"{value / scale:.2f}" for value in raw_values) + lines = [ + f"

{html.escape(benchmark)}

", + "", + "```mermaid", + "---", + "config:", + " xyChart:", + f" width: {CHART_WIDTH}", + f" height: {CHART_HEIGHT}", + " showTitle: false", + " xAxis:", + " labelFontSize: 24", + " titleFontSize: 28", + " yAxis:", + " labelFontSize: 22", + " titleFontSize: 28", + " showTitle: false", + " themeVariables:", + " xyChart:", + ' plotColorPalette: "#003E7E, #62B5E5, #C7E9FB, #C7E9FB"', + "---", + "xychart", + f" x-axis [{labels}]", + f' y-axis "{metric} ({chart_unit})"', + f" line [{values}]", + ] + if reference_lines: + assert baseline is not None + assert sigma is not None + lines.extend( + [ + f" line [{repeated_values(baseline / scale, len(raw_values))}]", + f" line [{repeated_values((baseline - sigma) / scale, len(raw_values))}]", + f" line [{repeated_values((baseline + sigma) / scale, len(raw_values))}]", + ] + ) + lines.extend(["```", ""]) + return "\n".join(lines) + + +def render_chart_table( + loaded: List[PerfRun], + benchmarks: List[str], + metric: str, + unit: str, + *, + reverse_series: bool = False, + reference_lines: bool = True, + reference_loaded: Optional[List[PerfRun]] = None, + reference_limit: Optional[int] = None, +) -> str: + """Render benchmark charts in a table.""" + lines = [""] + for index, benchmark in enumerate(benchmarks): + if index % CHART_COLUMNS == 0: + lines.append("") + lines.append(f'") + if index % CHART_COLUMNS == CHART_COLUMNS - 1: + lines.append("") + lines.append(f'') + remaining = len(benchmarks) % CHART_COLUMNS + if remaining: + for _ in range(CHART_COLUMNS - remaining): + lines.append(f'') + lines.append("") + lines.append("
') + reference_series = None + chart_reference_lines = reference_lines + if reference_loaded is not None: + limited_reference_series = [ + (label, value) + for label, _, _, data in reference_loaded + if (value := metric_value(data, benchmark, metric)) is not None + ] + if reference_limit is not None: + limited_reference_series = limited_reference_series[-reference_limit:] + if limited_reference_series: + reference_series = limited_reference_series + else: + chart_reference_lines = False + series = [ + *limited_reference_series, + *[ + (label, value) + for label, _, _, data in loaded[len(reference_loaded) :] + if (value := metric_value(data, benchmark, metric)) is not None + ], + ] + else: + series = [ + (label, value) + for label, _, _, data in loaded + if (value := metric_value(data, benchmark, metric)) is not None + ] + lines.append( + render_mermaid_xychart( + series, + benchmark, + metric, + unit, + reverse_series=reverse_series, + reference_lines=chart_reference_lines, + reference_series=reference_series, + ) + ) + lines.append("


") + lines.append("") + return "\n".join(lines) + + +def render_runs_table(loaded: List[PerfRun]) -> str: + """Render a compact x-axis label to commit map.""" + labels = [label for label, _, _, _ in loaded] + commit_links = [] + for _, _, commit, data in loaded: + metadata = data.get(METADATA_KEY, {}) + commit_sha = metadata.get("commit") if isinstance(metadata, dict) else None + short_commit = commit_sha[:8] if isinstance(commit_sha, str) else "" + commit_links.append( + f"[{short_commit}]({commit})" if commit and short_commit else short_commit + ) + + if not labels: + return "" + + return "\n".join( + [ + "### Commits", + "", + f"| Seq | {' | '.join(labels)} |", + f"| --- | {' | '.join('---' for _ in labels)} |", + f"| Commit | {' | '.join(commit_links)} |", + "", + ] + ) + + +def render_metric_group( + loaded: List[PerfRun], + metric: str, + title: str, + unit: str, + *, + benchmarks: Optional[List[str]] = None, + reverse_series: bool = False, + reference_lines: bool = True, + reference_loaded: Optional[List[PerfRun]] = None, + reference_limit: Optional[int] = None, +) -> str: + """Render one chart per benchmark that reports the given metric.""" + benchmarks = ( + benchmarks if benchmarks is not None else benchmarks_with_metric(loaded, metric) + ) + lines = [f"## {title} ({unit})", ""] + if not benchmarks: + lines.append(f"_No benchmarks with a `{metric}` metric found._") + lines.append("") + return "\n".join(lines) + + lines.append( + render_chart_table( + loaded, + benchmarks, + metric, + unit, + reverse_series=reverse_series, + reference_lines=reference_lines, + reference_loaded=reference_loaded, + reference_limit=reference_limit, + ) + ) + return "\n".join(lines) + + +def render_perf_summary(loaded: List[PerfRun]) -> str: + """Render all perf metric groups as markdown.""" + lines = [ + "# Performance summary", + "", + "_Each chart shows run values, an EWMA baseline, and +/-1 sigma reference lines._", + "", + render_runs_table(loaded), + ] + for metric, title, unit in METRIC_GROUPS: + lines.append(render_metric_group(loaded, metric, title, unit)) + return "\n".join(lines) diff --git a/scripts/perf_summary.py b/scripts/perf_summary.py index df9fd603cc4..441f8d9e62a 100644 --- a/scripts/perf_summary.py +++ b/scripts/perf_summary.py @@ -1,268 +1,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the Apache 2.0 License. -import os -import sys -import json import argparse -import html -import statistics -from typing import List, Optional, Tuple - -# Metric groups to chart over time. A chart is produced for every benchmark that -# reports each metric. -METRIC_GROUPS = [ - ("throughput", "Throughput", "tx/s"), - ("latency", "Latency", "ms"), - ("memory", "Memory", "bytes"), - ("rate", "Rate", "ops/s"), -] -CHART_MAX_POINTS = 30 -CHART_COLUMNS = 4 -CHART_CELL_WIDTH = f"{100 // CHART_COLUMNS}%" -EWMA_ALPHA = 0.3 -DEFAULT_REPOSITORY = "microsoft/CCF" -METADATA_KEY = "__metadata" - -PerfRun = Tuple[str, Optional[str], Optional[str], dict] -ChartSeries = List[Tuple[str, float]] - - -def jobid_sort_key(name: str) -> Tuple[int, object]: - """Order perf files chronologically by their numeric job id. - - File names have the form ``--.json`` where - each component increases over time, so ordering by the integer components - gives chronological order. Falls back to the name for unexpected formats. - """ - stem = name[:-5] if name.endswith(".json") else name - try: - return (0, tuple(int(part) for part in stem.split("-"))) - except ValueError: - return (1, name) - - -def list_perf_files(directory: str) -> List[str]: - """Return perf files in the directory, ordered chronologically (oldest first).""" - if not os.path.isdir(directory): - return [] - files = [ - name - for name in os.listdir(directory) - if os.path.isfile(os.path.join(directory, name)) - ] - return sorted(files, key=jobid_sort_key) - - -def run_label(name: str) -> str: - """Short x-axis label for a perf file: the run number when available.""" - stem = name[:-5] if name.endswith(".json") else name - parts = stem.split("-") - return parts[1] if len(parts) >= 2 else stem - - -def run_url(name: str) -> Optional[str]: - """GitHub Actions URL for a perf file, when the run id can be parsed.""" - stem = name[:-5] if name.endswith(".json") else name - parts = stem.split("-") - if not parts or not parts[0].isdigit(): - return None - - server_url = os.environ.get("GITHUB_SERVER_URL", "https://github.com").rstrip("/") - repository = os.environ.get("GITHUB_REPOSITORY", DEFAULT_REPOSITORY) - return f"{server_url}/{repository}/actions/runs/{parts[0]}" - - -def commit_url(metadata: dict) -> Optional[str]: - """GitHub commit URL from perf metadata, when available.""" - commit = metadata.get("commit") - if not isinstance(commit, str) or not commit: - return None - - server_url = metadata.get("server_url") or os.environ.get( - "GITHUB_SERVER_URL", "https://github.com" - ) - repository = metadata.get("repository") or os.environ.get( - "GITHUB_REPOSITORY", DEFAULT_REPOSITORY - ) - if not isinstance(server_url, str) or not isinstance(repository, str): - return None - return f"{server_url.rstrip('/')}/{repository}/commit/{commit}" - - -def load_perf_data(directory: str, files: List[str]) -> List[PerfRun]: - """Load (label, run_url, commit_url, data) for each readable perf file.""" - loaded: List[PerfRun] = [] - for name in files: - try: - with open(os.path.join(directory, name), "r") as f: - data = json.load(f) - except (OSError, json.JSONDecodeError): - continue - if isinstance(data, dict): - metadata = data.get(METADATA_KEY, {}) - if not isinstance(metadata, dict): - metadata = {} - loaded.append((run_label(name), run_url(name), commit_url(metadata), data)) - return loaded - - -def metric_value(data: dict, benchmark: str, metric: str) -> Optional[float]: - """Return the numeric value of a benchmark metric, or None if absent.""" - metrics = data.get(benchmark) - if not isinstance(metrics, dict): - return None - entry = metrics.get(metric) - if not isinstance(entry, dict): - return None - value = entry.get("value") - return value if isinstance(value, (int, float)) else None - - -def benchmarks_with_metric(loaded: List[PerfRun], metric: str) -> List[str]: - """Sorted names of benchmarks that report the given metric in any run.""" - names = set() - for _, _, _, data in loaded: - for benchmark in data: - if benchmark == METADATA_KEY: - continue - if metric_value(data, benchmark, metric) is not None: - names.add(benchmark) - return sorted(names) - - -def ewma(values: List[float], alpha: float = EWMA_ALPHA) -> float: - """Return the exponentially weighted moving average of the values.""" - average = values[0] - for value in values[1:]: - average = alpha * value + (1 - alpha) * average - return average - - -def repeated_values(value: float, count: int) -> str: - """Render a constant series for every chart category.""" - return ", ".join(f"{value:.2f}" for _ in range(count)) - - -def render_mermaid_xychart( - series: ChartSeries, - benchmark: str, - metric: str, - unit: str, -) -> str: - """Render a Mermaid xychart line chart for a single benchmark metric.""" - ordered_series = list(reversed(series)) - labels = ", ".join(f'"{label}"' for label, _ in ordered_series) - raw_values = [value for _, value in ordered_series] - values = ", ".join(f"{value:.2f}" for value in raw_values) - chronological_values = [value for _, value in series] - baseline = ewma(chronological_values) - sigma = ( - statistics.pstdev(chronological_values) if len(chronological_values) > 1 else 0 - ) - lines = [ - f"

{html.escape(benchmark)}

", - "", - "```mermaid", - "---", - "config:", - " xyChart:", - " width: 220", - " height: 320", - " showTitle: false", - " xAxis:", - " labelFontSize: 10", - " titleFontSize: 12", - " yAxis:", - " labelFontSize: 8", - " titleFontSize: 12", - " showTitle: false", - " themeVariables:", - " xyChart:", - ' plotColorPalette: "#003E7E, #62B5E5, #C7E9FB, #C7E9FB"', - "---", - "xychart horizontal", - f" x-axis [{labels}]", - f' y-axis "{metric} ({unit})"', - f" line [{values}]", - f" line [{repeated_values(baseline, len(raw_values))}]", - f" line [{repeated_values(baseline - sigma, len(raw_values))}]", - f" line [{repeated_values(baseline + sigma, len(raw_values))}]", - "```", - "", - ] - return "\n".join(lines) - - -def render_chart_table( - loaded: List[PerfRun], benchmarks: List[str], metric: str, unit: str -) -> str: - """Render benchmark charts in a four-column table.""" - lines = [''] - for index, benchmark in enumerate(benchmarks): - if index % CHART_COLUMNS == 0: - lines.append("") - lines.append(f'") - if index % CHART_COLUMNS == CHART_COLUMNS - 1: - lines.append("") - remaining = len(benchmarks) % CHART_COLUMNS - if remaining: - for _ in range(CHART_COLUMNS - remaining): - lines.append(f'') - lines.append("") - lines.append("
') - series = [ - (label, value) - for label, _, _, data in loaded - if (value := metric_value(data, benchmark, metric)) is not None - ] - lines.append(render_mermaid_xychart(series, benchmark, metric, unit)) - lines.append("
") - lines.append("") - return "\n".join(lines) - - -def render_runs_table(loaded: List[PerfRun]) -> str: - """Render a compact table of run labels, Actions runs, and commits.""" - lines = ["### Runs", "", "| Run | Actions | Commit |", "| --- | --- | --- |"] - for label, run, commit, data in reversed(loaded): - metadata = data.get(METADATA_KEY, {}) - commit_sha = metadata.get("commit") if isinstance(metadata, dict) else None - short_commit = commit_sha[:8] if isinstance(commit_sha, str) else "" - run_link = f"[run]({run})" if run else "" - commit_link = f"[{short_commit}]({commit})" if commit and short_commit else "" - lines.append(f"| {label} | {run_link} | {commit_link} |") - lines.append("") - return "\n".join(lines) - - -def render_metric_group( - loaded: List[PerfRun], metric: str, title: str, unit: str -) -> str: - """Render one chart per benchmark that reports the given metric.""" - benchmarks = benchmarks_with_metric(loaded, metric) - lines = [f"## {title} ({unit})", ""] - if not benchmarks: - lines.append(f"_No benchmarks with a `{metric}` metric found._") - lines.append("") - return "\n".join(lines) - - lines.append(render_chart_table(loaded, benchmarks, metric, unit)) - return "\n".join(lines) - +import sys -def render_perf_summary(loaded: List[PerfRun]) -> str: - """Render all perf metric groups as markdown.""" - lines = [ - "# Performance summary", - "", - "_Each chart shows run values, an EWMA baseline, and +/-1 sigma reference lines._", - "", - render_runs_table(loaded), - ] - for metric, title, unit in METRIC_GROUPS: - lines.append(render_metric_group(loaded, metric, title, unit)) - return "\n".join(lines) +from perf_report import CHART_MAX_POINTS, list_perf_files, load_perf_data +from perf_report import render_perf_summary def main() -> None: @@ -278,7 +21,6 @@ def main() -> None: args = parser.parse_args() files = list_perf_files(args.directory) - recent = files[-CHART_MAX_POINTS:] loaded = load_perf_data(args.directory, recent) print(render_perf_summary(loaded))