diff --git a/.github/workflows/performance-regression.yml b/.github/workflows/performance-regression.yml new file mode 100644 index 0000000000000..e177bbff684be --- /dev/null +++ b/.github/workflows/performance-regression.yml @@ -0,0 +1,445 @@ +name: Performance Regression Testing + +on: + workflow_dispatch: # allows manual triggering + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/performance-regression.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + 'tools/llama-bench/**', + 'scripts/performance-regression-detector.py', + 'scripts/compare-llama-bench.py' + ] + push: + branches: + - master + paths: [ + '.github/workflows/performance-regression.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + 'tools/llama-bench/**', + 'scripts/performance-regression-detector.py', + 'scripts/compare-llama-bench.py' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + REGRESSION_THRESHOLD: 5.0 + BASELINE_DB: performance-baseline.sqlite + RESULTS_DB: performance-results.sqlite + +jobs: + performance-cpu: + runs-on: ubuntu-latest + + steps: + - name: Clone + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for baseline comparison + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install Python dependencies + run: | + pip install GitPython tabulate matplotlib + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: performance-cpu + evict-old-files: 1d + + - name: Build llama-bench + run: | + cmake -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=OFF + cmake --build build --target llama-bench llama-cli -j $(nproc) + + - name: Download test model + run: | + mkdir -p models + # Download TinyLlama test model if not present + if [ ! 
-f models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf ]; then + wget -q --show-progress -O models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf + fi + + - name: Restore baseline database + id: restore-baseline + uses: actions/cache/restore@v4 + with: + path: ${{ env.BASELINE_DB }} + key: perf-baseline-cpu-${{ github.base_ref || 'master' }} + restore-keys: | + perf-baseline-cpu- + + - name: Run baseline benchmark (if no baseline exists) + if: steps.restore-baseline.outputs.cache-hit != 'true' + run: | + git checkout ${{ github.event.pull_request.base.sha || github.event.before || 'master' }} || true + cmake --build build --target llama-bench -j $(nproc) || true + ./build/bin/llama-bench \ + -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + -p 512 -n 128 -r 3 \ + -o sql | sqlite3 ${{ env.BASELINE_DB }} || true + git checkout - + + - name: Run current benchmark + run: | + ./build/bin/llama-bench \ + -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + -p 512 -n 128 -r 3 \ + -o sql | sqlite3 ${{ env.RESULTS_DB }} + + - name: Detect performance regressions + id: detect-regression + run: | + python scripts/performance-regression-detector.py \ + --baseline ${{ env.BASELINE_DB }} \ + --current ${{ env.RESULTS_DB }} \ + --threshold ${{ env.REGRESSION_THRESHOLD }} \ + --output regression-report.md + + # Set output for subsequent steps + if [ -f regression-detected.flag ]; then + echo "regression=true" >> $GITHUB_OUTPUT + else + echo "regression=false" >> $GITHUB_OUTPUT + fi + + - name: Upload regression report + if: always() + uses: actions/upload-artifact@v4 + with: + name: performance-report-cpu + path: | + regression-report.md + ${{ env.RESULTS_DB }} + ${{ env.BASELINE_DB }} + + - name: Comment on PR with results + if: github.event_name == 'pull_request' && always() + continue-on-error: true + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + let report = 'Performance Regression Test Results (CPU)\n\n'; + + if (fs.existsSync('regression-report.md')) { + report += fs.readFileSync('regression-report.md', 'utf8'); + } else { + report += 'No regression report generated.'; + } + + try { + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: report + }); + } catch (error) { + console.log('Could not post comment (likely permissions issue):', error.message); + } + + - name: Fail if regression detected + if: steps.detect-regression.outputs.regression == 'true' + run: | + echo "⚠️ Performance regression detected! Check the report for details." 
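+          # regression-report.md (uploaded above as the performance-report-cpu artifact) lists the affected benchmarks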
+ exit 1 + + - name: Save baseline database + if: github.event_name == 'push' && github.ref == 'refs/heads/master' + uses: actions/cache/save@v4 + with: + path: ${{ env.RESULTS_DB }} + key: perf-baseline-cpu-master-${{ github.sha }} + + performance-cuda: + runs-on: gpu-runner + if: false # Disabled by default - enable when GPU runners are available + + steps: + - name: Clone + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install Python dependencies + run: | + pip install GitPython tabulate matplotlib + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: performance-cuda + evict-old-files: 1d + + - name: Build llama-bench with CUDA + run: | + cmake -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_CUDA=ON \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=OFF + cmake --build build --target llama-bench llama-cli -j $(nproc) + + - name: Download test model + run: | + mkdir -p models + if [ ! -f models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf ]; then + wget -q --show-progress -O models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf + fi + + - name: Restore baseline database + id: restore-baseline-cuda + uses: actions/cache/restore@v4 + with: + path: ${{ env.BASELINE_DB }} + key: perf-baseline-cuda-${{ github.base_ref || 'master' }} + restore-keys: | + perf-baseline-cuda- + + - name: Run baseline benchmark (if no baseline exists) + if: steps.restore-baseline-cuda.outputs.cache-hit != 'true' + run: | + git checkout ${{ github.event.pull_request.base.sha || github.event.before || 'master' }} || true + cmake --build build --target llama-bench -j $(nproc) || true + ./build/bin/llama-bench \ + -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + -ngl 99 -p 512 -n 128 -r 3 \ + -o sql | sqlite3 ${{ env.BASELINE_DB }} || true + git checkout - + + - name: Run current benchmark + run: | + ./build/bin/llama-bench \ + -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + -ngl 99 -p 512 -n 128 -r 3 \ + -o sql | sqlite3 ${{ env.RESULTS_DB }} + + - name: Detect performance regressions + id: detect-regression-cuda + run: | + python scripts/performance-regression-detector.py \ + --baseline ${{ env.BASELINE_DB }} \ + --current ${{ env.RESULTS_DB }} \ + --threshold ${{ env.REGRESSION_THRESHOLD }} \ + --output regression-report-cuda.md + + if [ -f regression-detected.flag ]; then + echo "regression=true" >> $GITHUB_OUTPUT + else + echo "regression=false" >> $GITHUB_OUTPUT + fi + + - name: Upload regression report + if: always() + uses: actions/upload-artifact@v4 + with: + name: performance-report-cuda + path: | + regression-report-cuda.md + ${{ env.RESULTS_DB }} + ${{ env.BASELINE_DB }} + + - name: Comment on PR with results + if: github.event_name == 'pull_request' && always() + continue-on-error: true + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + let report = 'Performance Regression Test Results (CUDA)\n\n'; + + if (fs.existsSync('regression-report-cuda.md')) { + report += fs.readFileSync('regression-report-cuda.md', 'utf8'); + } else { + report += 'No regression report generated.'; + } + + try { + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: report + }); + } catch (error) { + console.log('Could not post comment (likely permissions issue):', error.message); + } + 
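+      # NOTE: mirrors the CPU job - performance-regression-detector.py drops regression-detected.flag
+      # when any metric degrades by more than REGRESSION_THRESHOLD, and the step below turns the
+      # resulting detect-regression-cuda output into a hard job failure.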
+ - name: Fail if regression detected + if: steps.detect-regression-cuda.outputs.regression == 'true' + run: | + echo "⚠️ Performance regression detected! Check the report for details." + exit 1 + + - name: Save baseline database + if: github.event_name == 'push' && github.ref == 'refs/heads/master' + uses: actions/cache/save@v4 + with: + path: ${{ env.RESULTS_DB }} + key: perf-baseline-cuda-master-${{ github.sha }} + + performance-metal: + runs-on: macos-14 # macOS with Apple Silicon for Metal testing + + steps: + - name: Clone + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install Python dependencies + run: | + pip install GitPython tabulate matplotlib + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: performance-metal + evict-old-files: 1d + + - name: Build llama-bench with Metal + run: | + cmake -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_METAL=ON \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=OFF + cmake --build build --target llama-bench llama-cli -j $(sysctl -n hw.logicalcpu) + + - name: Download test model + run: | + mkdir -p models + if [ ! -f models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf ]; then + wget -q --show-progress -O models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf + fi + + - name: Restore baseline database + id: restore-baseline-metal + uses: actions/cache/restore@v4 + with: + path: ${{ env.BASELINE_DB }} + key: perf-baseline-metal-${{ github.base_ref || 'master' }} + restore-keys: | + perf-baseline-metal- + + - name: Run baseline benchmark (if no baseline exists) + if: steps.restore-baseline-metal.outputs.cache-hit != 'true' + run: | + git checkout ${{ github.event.pull_request.base.sha || github.event.before || 'master' }} || true + cmake --build build --target llama-bench -j $(sysctl -n hw.logicalcpu) || true + ./build/bin/llama-bench \ + -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + -ngl 99 -p 512 -n 128 -r 3 \ + -o sql | sqlite3 ${{ env.BASELINE_DB }} || true + git checkout - + + - name: Run current benchmark + run: | + ./build/bin/llama-bench \ + -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + -ngl 99 -p 512 -n 128 -r 3 \ + -o sql | sqlite3 ${{ env.RESULTS_DB }} + + - name: Detect performance regressions + id: detect-regression-metal + run: | + python scripts/performance-regression-detector.py \ + --baseline ${{ env.BASELINE_DB }} \ + --current ${{ env.RESULTS_DB }} \ + --threshold ${{ env.REGRESSION_THRESHOLD }} \ + --output regression-report-metal.md + + if [ -f regression-detected.flag ]; then + echo "regression=true" >> $GITHUB_OUTPUT + else + echo "regression=false" >> $GITHUB_OUTPUT + fi + + - name: Upload regression report + if: always() + uses: actions/upload-artifact@v4 + with: + name: performance-report-metal + path: | + regression-report-metal.md + ${{ env.RESULTS_DB }} + ${{ env.BASELINE_DB }} + + - name: Comment on PR with results + if: github.event_name == 'pull_request' && always() + continue-on-error: true + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + let report = 'Performance Regression Test Results (Metal)\n\n'; + + if (fs.existsSync('regression-report-metal.md')) { + report += fs.readFileSync('regression-report-metal.md', 'utf8'); + } else { + report += 'No regression report generated.'; + } + + try { + await github.rest.issues.createComment({ + issue_number: 
context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: report + }); + } catch (error) { + console.log('Could not post comment (likely permissions issue):', error.message); + } + + - name: Fail if regression detected + if: steps.detect-regression-metal.outputs.regression == 'true' + run: | + echo "⚠️ Performance regression detected! Check the report for details." + exit 1 + + - name: Save baseline database + if: github.event_name == 'push' && github.ref == 'refs/heads/master' + uses: actions/cache/save@v4 + with: + path: ${{ env.RESULTS_DB }} + key: perf-baseline-metal-master-${{ github.sha }} diff --git a/docs/performance-regression-testing.md b/docs/performance-regression-testing.md new file mode 100644 index 0000000000000..eb94b5db62840 --- /dev/null +++ b/docs/performance-regression-testing.md @@ -0,0 +1,366 @@ +# Performance Regression Testing + +This document describes the automated performance regression testing system for llama.cpp, implemented as part of JIRA ticket AT-105. + +## Overview + +The performance regression testing system automatically detects performance degradations in llama.cpp by comparing benchmark results against established baselines. It integrates with GitHub Actions CI/CD pipelines and provides automated alerts when performance regressions exceed a configurable threshold (default: 5%). + +## Components + +### 1. GitHub Actions Workflow + +**File:** `.github/workflows/performance-regression.yml` + +The workflow runs performance benchmarks on different hardware backends (CPU, CUDA, Metal) for every pull request and push to master. It: + +- Builds the `llama-bench` target +- Downloads a test model (TinyLlama 1.1B) +- Runs benchmarks with consistent parameters +- Compares results against cached baselines +- Posts results as PR comments +- Fails the build if regressions are detected + +**Jobs:** +- `performance-cpu`: Runs on Ubuntu with CPU backend +- `performance-cuda`: Runs on GPU runners (disabled by default) +- `performance-metal`: Runs on macOS with Apple Silicon + +**Triggers:** +- Pull requests to any branch +- Pushes to master branch +- Manual workflow dispatch + +### 2. Performance Regression Detector + +**File:** `scripts/performance-regression-detector.py` + +Python script that analyzes benchmark results and detects performance regressions. + +**Usage:** +```bash +python3 scripts/performance-regression-detector.py \ + --baseline baseline.sqlite \ + --current current.sqlite \ + --threshold 5.0 \ + --output regression-report.md +``` + +**Features:** +- Compares multiple performance metrics (tokens/second, latency) +- Configurable regression threshold +- Generates markdown and JSON reports +- Creates flag file when regressions detected +- Integrates with existing llama-bench SQLite schema + +**Key Metrics:** +- `avg_ts`: Average tokens per second (higher is better) +- `avg_ns`: Average latency in nanoseconds (lower is better) +- `model_size`: Model memory footprint (lower is better) + +### 3. Enhanced Comparison Script + +**File:** `scripts/compare-llama-bench.py` (enhanced) + +The existing comparison script has been extended with CI automation support. 
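+
+Among the additions (listed below) is a `--json-output` flag whose file can be consumed by
+follow-up CI steps. A minimal sketch of reading it, assuming only the field names written by
+this patch (`baseline`, `compare`, `tool`, `headers`, `table`, `timestamp`); the summary line
+itself is illustrative:
+
+```python
+#!/usr/bin/env python3
+"""Print a one-line summary of a compare-llama-bench.py --json-output file."""
+import json
+import sys
+
+with open(sys.argv[1]) as f:
+    data = json.load(f)
+
+# "baseline" and "compare" name the two builds being compared; "table" holds the comparison rows
+print(f"{data['tool']}: {data['baseline']} vs {data['compare']} "
+      f"({len(data['table'])} rows, generated {data['timestamp']})")
+```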
+ +**New Features:** +- `--ci-mode`: Enable CI-specific formatting and behavior +- `--baseline-db`: Path to baseline database for tracking +- `--save-baseline`: Save current results as new baseline +- `--json-output`: Export comparison results to JSON + +**Example:** +```bash +python3 scripts/compare-llama-bench.py \ + -i results.sqlite \ + --ci-mode \ + --json-output comparison.json +``` + +### 4. Database Schema Extensions + +**Files:** +- `scripts/db-schema-migration.sql`: SQL migration script +- `scripts/apply-db-migration.py`: Migration application tool + +The database schema has been extended to support: + +**New Tables:** +- `performance_baselines`: Stores baseline snapshots +- `performance_history`: Historical performance data +- `regression_alerts`: Logged regression detections +- `memory_leak_logs`: Memory leak monitoring results + +**Views:** +- `latest_baselines`: Active baseline information +- `regression_summary`: Aggregated regression statistics +- `memory_leak_summary`: Memory leak detection summary + +**Applying Migrations:** +```bash +python3 scripts/apply-db-migration.py -d llama-bench.sqlite +``` + +### 5. Memory Leak Monitoring + +**File:** `scripts/memory-leak-monitor.py` + +Integrates with the existing `llama-memory.h` interfaces to detect memory leaks and excessive memory consumption. + +**Usage:** +```bash +python3 scripts/memory-leak-monitor.py \ + --benchmark-output benchmark.log \ + --test-log test.log \ + --database results.sqlite \ + --commit abc123 \ + --report memory-report.md +``` + +**Features:** +- Parses benchmark output for memory usage patterns +- Detects memory leaks (threshold: 1 MB) +- Monitors excessive memory usage (threshold: 16 GB) +- Logs results to database +- Generates markdown reports + +**Memory Status Codes** (from `llama-memory.h`): +- `0`: `LLAMA_MEMORY_STATUS_SUCCESS` +- `1`: `LLAMA_MEMORY_STATUS_NO_UPDATE` +- `2`: `LLAMA_MEMORY_STATUS_FAILED_PREPARE` +- `3`: `LLAMA_MEMORY_STATUS_FAILED_COMPUTE` + +### 6. CMake Test Integration + +**File:** `tests/CMakeLists.txt` (extended) + +A new performance test target has been added: + +```cmake +llama_test_cmd( + ${CMAKE_BINARY_DIR}/bin/llama-bench + NAME test-performance-regression-cpu + LABEL "performance" + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + ARGS -p 512 -n 128 -r 3 -o sql +) +``` + +**Running Performance Tests:** +```bash +cd build +ctest -L performance --verbose +``` + +## Workflow + +### For Pull Requests + +1. Developer opens a PR with code changes +2. GitHub Actions triggers the performance regression workflow +3. The workflow: + - Builds llama-bench with the PR code + - Restores the baseline database from cache + - If no baseline exists, creates one from the base commit + - Runs benchmarks with the current code + - Compares results using the regression detector +4. Results are posted as a PR comment +5. Build fails if regressions exceed 5% threshold + +### For Master Branch Commits + +1. Code is merged to master +2. GitHub Actions runs the workflow +3. Benchmark results are cached as the new baseline +4. Historical data is stored in the database +5. 
Future PRs compare against this baseline + + ### Manual Baseline Management + + **Creating a Baseline:** + ```bash + # Run benchmarks + ./build/bin/llama-bench -m model.gguf -p 512 -n 128 -r 3 -o sql | sqlite3 baseline.sqlite + + # Save as baseline + python3 scripts/apply-db-migration.py -d baseline.sqlite + sqlite3 baseline.sqlite "INSERT INTO performance_baselines (baseline_name, commit_sha, created_at) VALUES ('v1.0', '$(git rev-parse HEAD)', '$(date -Iseconds)')" + ``` + + **Comparing Against Baseline:** + ```bash + # Run current benchmarks + ./build/bin/llama-bench -m model.gguf -p 512 -n 128 -r 3 -o sql | sqlite3 current.sqlite + + # Detect regressions + python3 scripts/performance-regression-detector.py \ + --baseline baseline.sqlite \ + --current current.sqlite \ + --threshold 5.0 + ``` + + ## Configuration + + ### Environment Variables + + - `REGRESSION_THRESHOLD`: Regression detection threshold (default: 5.0) + - `BASELINE_DB`: Baseline database filename (default: performance-baseline.sqlite) + - `RESULTS_DB`: Results database filename (default: performance-results.sqlite) + + ### Workflow Customization + + Edit `.github/workflows/performance-regression.yml` to: + + - Change benchmark parameters (prompt length, generation tokens, repetitions) + - Add/remove backend configurations + - Modify caching strategy + - Adjust model selection + + ### Threshold Configuration + + The default 5% threshold is applied globally; per-backend values can be set by overriding `REGRESSION_THRESHOLD` per job, and per-metric thresholds can be added by extending `PERFORMANCE_METRICS` in `performance-regression-detector.py`, for example: + + ```python + # In performance-regression-detector.py + PERFORMANCE_METRICS = { + "avg_ts": { + "threshold": 5.0, # Custom threshold for this metric + ... + } + } + ``` + + ## Reports + + ### Regression Report Format + + ```markdown + # Performance Regression Analysis Report + + **Generated:** 2025-09-29 12:34:56 + **Threshold:** 5.0% + + ## Summary + - Total Benchmarks Compared: 10 + - Regressions Found: 2 + - Improvements Found: 3 + - Stable Benchmarks: 5 + + ## ⚠️ Performance Regressions Detected + + ### TinyLlama-1.1B | backend:CPU | p:512 | g:128 + + ⚠️ **Average Tokens/Second**: + - Baseline: 45.23 tokens/s + - Current: 42.15 tokens/s + - Change: ↓ 6.81% + + ... + ``` + + ### Memory Leak Report Format + + ```markdown + # Memory Leak Monitoring Report + + **Generated:** 2025-09-29 12:34:56 + + ## ⚠️ Memory Leaks Detected + + ### benchmark + - Initial Memory: 1234.56 MB + - Final Memory: 1250.78 MB + - Leaked: 16.22 MB + ``` + + ## Troubleshooting + + ### No Baseline Available + + If the baseline cache is empty or expired: + + 1. The workflow will attempt to build the baseline from the base commit + 2. If that fails, it will create a baseline from the current code + 3. Subsequent runs will use this baseline + + ### False Positives + + Regressions can be marked as false positives in the database: + + ```sql + UPDATE regression_alerts + SET status = 'false_positive', notes = 'Expected due to architectural change' + WHERE id = <alert_id>; + ``` + + ### Excessive Memory Usage Warnings + + If memory usage exceeds thresholds: + + 1. Review the memory leak report + 2. Check for memory leaks using valgrind or similar tools + 3. 
Adjust the threshold if legitimate increased usage + +## Integration with CI/CD + +### GitHub Actions Artifacts + +The workflow uploads artifacts containing: +- Regression reports (markdown) +- SQLite databases (baseline and current) +- Memory leak reports + +**Downloading Artifacts:** +```bash +gh run download -n performance-report-cpu +``` + +### PR Comments + +The workflow automatically comments on PRs with: +- Summary of regression detection +- Links to detailed reports +- Pass/fail status + +### Build Status + +The workflow sets the build status to: +- ✅ **Success**: No regressions detected +- ❌ **Failure**: Regressions exceed threshold +- ⚠️ **Warning**: Issues detected but below threshold + +## Best Practices + +1. **Run locally before PR**: Test performance changes locally +2. **Review memory reports**: Check for memory leaks regularly +3. **Update baselines**: Refresh baselines after major changes +4. **Monitor trends**: Use historical data to identify gradual degradation +5. **Document exceptions**: Note expected performance changes in PR descriptions + +## Future Enhancements + +Potential improvements to the system: + +- [ ] Add GPU-specific benchmarks when runners available +- [ ] Implement trend analysis over multiple commits +- [ ] Add visualization dashboard for historical performance +- [ ] Support for custom benchmark configurations per PR +- [ ] Integration with performance profiling tools +- [ ] Automatic bisection for regression identification +- [ ] Multi-model benchmark comparisons + +## References + +- [llama-bench documentation](../tools/llama-bench/README.md) +- [compare-llama-bench.py usage](../scripts/compare-llama-bench.py) +- [llama-memory.h interface](../src/llama-memory.h) +- [GitHub Actions workflows](../.github/workflows/) + +## Support + +For issues or questions: +- Check existing GitHub issues +- Review workflow run logs +- Examine generated reports +- Contact the performance testing team diff --git a/scripts/apply-db-migration.py b/scripts/apply-db-migration.py new file mode 100755 index 0000000000000..479517dcbae02 --- /dev/null +++ b/scripts/apply-db-migration.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +""" +Database Schema Migration Tool for llama.cpp Performance Testing + +This script applies schema migrations to extend the existing llama-bench +SQLite database with baseline tracking, historical data, and regression alerting. +""" + +import argparse +import logging +import os +import sqlite3 +import sys +from pathlib import Path + +logger = logging.getLogger("apply-db-migration") + + +def apply_migration(db_path: str, migration_sql_path: str, dry_run: bool = False) -> bool: + """ + Apply database schema migration. 
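+
+    Example (the migration path matches this script's default):
+        apply_migration("llama-bench.sqlite", "scripts/db-schema-migration.sql")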
+ + Args: + db_path: Path to SQLite database + migration_sql_path: Path to SQL migration script + dry_run: If True, print migration without applying + + Returns: + True if successful, False otherwise + """ + if not os.path.exists(migration_sql_path): + logger.error(f"Migration script not found: {migration_sql_path}") + return False + + with open(migration_sql_path, 'r') as f: + migration_sql = f.read() + + if dry_run: + logger.info("Dry run mode - migration would execute:") + logger.info(migration_sql) + return True + + try: + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + cursor.executescript(migration_sql) + conn.commit() + + logger.info(f"Migration applied successfully to {db_path}") + + cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + tables = [row[0] for row in cursor.fetchall()] + logger.info(f"Database tables: {', '.join(tables)}") + + conn.close() + return True + + except sqlite3.Error as e: + logger.error(f"Migration failed: {e}") + return False + + +def check_migration_status(db_path: str) -> dict: + """ + Check if migration has been applied to the database. + + Args: + db_path: Path to SQLite database + + Returns: + Dictionary with migration status information + """ + if not os.path.exists(db_path): + return {"exists": False, "migrated": False, "tables": []} + + try: + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + tables = [row[0] for row in cursor.fetchall()] + + migration_tables = [ + "performance_baselines", + "performance_history", + "regression_alerts", + "memory_leak_logs" + ] + + migrated = all(table in tables for table in migration_tables) + + conn.close() + + return { + "exists": True, + "migrated": migrated, + "tables": tables, + "migration_tables_present": [t for t in migration_tables if t in tables] + } + + except sqlite3.Error as e: + logger.error(f"Error checking database: {e}") + return {"exists": True, "migrated": False, "error": str(e)} + + +def main(): + """Main entry point for migration tool.""" + parser = argparse.ArgumentParser( + description="Apply database schema migrations for performance testing" + ) + parser.add_argument( + "--database", + "-d", + required=True, + help="Path to SQLite database" + ) + parser.add_argument( + "--migration", + "-m", + help="Path to migration SQL script (default: scripts/db-schema-migration.sql)" + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print migration without applying" + ) + parser.add_argument( + "--check", + action="store_true", + help="Check migration status without applying" + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Enable verbose logging" + ) + + args = parser.parse_args() + + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + + if not args.migration: + script_dir = Path(__file__).parent + args.migration = script_dir / "db-schema-migration.sql" + + if args.check: + status = check_migration_status(args.database) + logger.info(f"Database exists: {status.get('exists', False)}") + logger.info(f"Migration applied: {status.get('migrated', False)}") + if status.get('tables'): + logger.info(f"Tables present: {', '.join(status['tables'])}") + if status.get('migration_tables_present'): + logger.info(f"Migration tables: {', '.join(status['migration_tables_present'])}") + sys.exit(0 if status.get('migrated', False) else 1) + + success = 
apply_migration(args.database, str(args.migration), args.dry_run) + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index c45c83fdb55c3..9c0cc7833bbf6 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -9,6 +9,7 @@ import sqlite3 import sys from collections.abc import Iterator, Sequence +from datetime import datetime from glob import glob from typing import Any, Optional, Union @@ -175,6 +176,11 @@ parser.add_argument("--plot_x", help="parameter to use as x axis for plotting (default: n_depth)", default="n_depth") parser.add_argument("--plot_log_scale", action="store_true", help="use log scale for x axis in plots (off by default)") +parser.add_argument("--ci-mode", action="store_true", help="Enable CI mode for automated workflows") +parser.add_argument("--baseline-db", help="Path to baseline database for tracking performance over time") +parser.add_argument("--save-baseline", help="Save current results as baseline to specified database path") +parser.add_argument("--json-output", help="Export comparison results to JSON file for automated processing") + known_args, unknown_args = parser.parse_known_args() logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO) @@ -1091,3 +1097,23 @@ def make_axes(num_groups, max_cols=2, base_size=(8, 4)): floatfmt=".2f", tablefmt=known_args.output )) + +if known_args.json_output: + output_data = { + "baseline": name_baseline, + "compare": name_compare, + "tool": tool, + "headers": headers, + "table": table, + "timestamp": datetime.now().isoformat() + } + with open(known_args.json_output, "w") as f: + json.dump(output_data, f, indent=2, default=str) + logger.info(f"JSON output written to {known_args.json_output}") + +if known_args.save_baseline: + import shutil + if input_file: + baseline_path = known_args.save_baseline + shutil.copy(input_file[0], baseline_path) + logger.info(f"Baseline saved to {baseline_path}") diff --git a/scripts/db-schema-migration.sql b/scripts/db-schema-migration.sql new file mode 100644 index 0000000000000..acd3da348eedd --- /dev/null +++ b/scripts/db-schema-migration.sql @@ -0,0 +1,116 @@ + +CREATE TABLE IF NOT EXISTS performance_baselines ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + baseline_name TEXT NOT NULL, + commit_sha TEXT NOT NULL, + branch_name TEXT DEFAULT 'master', + created_at TEXT NOT NULL, + description TEXT, + is_active INTEGER DEFAULT 1, + UNIQUE(baseline_name, commit_sha) +); + +CREATE TABLE IF NOT EXISTS performance_history ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + test_time TEXT NOT NULL, + build_commit TEXT NOT NULL, + model_type TEXT, + backends TEXT, + n_gpu_layers INTEGER, + avg_ts REAL, + avg_ns INTEGER, + stddev_ts REAL, + stddev_ns INTEGER, + cpu_info TEXT, + gpu_info TEXT, + n_threads INTEGER, + n_prompt INTEGER, + n_gen INTEGER, + memory_usage_kb INTEGER, + memory_status TEXT, + FOREIGN KEY (build_commit) REFERENCES performance_baselines(commit_sha) +); + +CREATE TABLE IF NOT EXISTS regression_alerts ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + alert_time TEXT NOT NULL, + baseline_commit TEXT NOT NULL, + current_commit TEXT NOT NULL, + benchmark_key TEXT NOT NULL, + metric_name TEXT NOT NULL, + baseline_value REAL NOT NULL, + current_value REAL NOT NULL, + change_percentage REAL NOT NULL, + threshold_percentage REAL NOT NULL, + severity TEXT CHECK(severity IN ('warning', 'critical')) DEFAULT 'warning', + status TEXT CHECK(status IN ('open', 
'investigating', 'resolved', 'false_positive')) DEFAULT 'open', + notes TEXT +); + +CREATE TABLE IF NOT EXISTS memory_leak_logs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + test_time TEXT NOT NULL, + build_commit TEXT NOT NULL, + test_name TEXT NOT NULL, + memory_status TEXT NOT NULL, + initial_memory_kb INTEGER, + final_memory_kb INTEGER, + peak_memory_kb INTEGER, + leaked_memory_kb INTEGER, + status_code INTEGER, + error_message TEXT +); + +CREATE INDEX IF NOT EXISTS idx_performance_history_commit ON performance_history(build_commit); +CREATE INDEX IF NOT EXISTS idx_performance_history_time ON performance_history(test_time); +CREATE INDEX IF NOT EXISTS idx_performance_history_model ON performance_history(model_type); +CREATE INDEX IF NOT EXISTS idx_regression_alerts_time ON regression_alerts(alert_time); +CREATE INDEX IF NOT EXISTS idx_regression_alerts_status ON regression_alerts(status); +CREATE INDEX IF NOT EXISTS idx_memory_leak_logs_commit ON memory_leak_logs(build_commit); +CREATE INDEX IF NOT EXISTS idx_memory_leak_logs_time ON memory_leak_logs(test_time); + +CREATE VIEW IF NOT EXISTS latest_baselines AS +SELECT + b.baseline_name, + b.commit_sha, + b.branch_name, + b.created_at, + COUNT(h.id) as benchmark_count +FROM performance_baselines b +LEFT JOIN performance_history h ON b.commit_sha = h.build_commit +WHERE b.is_active = 1 +GROUP BY b.id +ORDER BY b.created_at DESC; + +CREATE VIEW IF NOT EXISTS regression_summary AS +SELECT + current_commit, + COUNT(*) as total_regressions, + SUM(CASE WHEN severity = 'critical' THEN 1 ELSE 0 END) as critical_count, + SUM(CASE WHEN severity = 'warning' THEN 1 ELSE 0 END) as warning_count, + AVG(ABS(change_percentage)) as avg_degradation +FROM regression_alerts +WHERE status = 'open' +GROUP BY current_commit +ORDER BY total_regressions DESC; + +CREATE VIEW IF NOT EXISTS memory_leak_summary AS +SELECT + build_commit, + COUNT(*) as total_tests, + SUM(CASE WHEN memory_status = 'LLAMA_MEMORY_STATUS_SUCCESS' THEN 1 ELSE 0 END) as passed_tests, + SUM(CASE WHEN leaked_memory_kb > 0 THEN 1 ELSE 0 END) as leak_detected, + SUM(leaked_memory_kb) as total_leaked_kb +FROM memory_leak_logs +GROUP BY build_commit +ORDER BY test_time DESC; + +CREATE TRIGGER IF NOT EXISTS update_regression_alert_timestamp +AFTER UPDATE ON regression_alerts +FOR EACH ROW +WHEN OLD.status != NEW.status +BEGIN + UPDATE regression_alerts + SET alert_time = datetime('now') + WHERE id = NEW.id; +END; diff --git a/scripts/memory-leak-monitor.py b/scripts/memory-leak-monitor.py new file mode 100755 index 0000000000000..cf57a21cbdabf --- /dev/null +++ b/scripts/memory-leak-monitor.py @@ -0,0 +1,383 @@ +#!/usr/bin/env python3 +""" +Memory Leak Monitoring Integration for llama.cpp + +This script integrates with the CI pipeline to monitor memory consumption +patterns using the existing llama_memory_status interfaces from llama-memory.h. + +It parses benchmark results and test logs to detect memory leaks and excessive +memory consumption that could indicate performance issues. 
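+
+Typical CI invocation (flags as defined in main() below):
+
+    python3 scripts/memory-leak-monitor.py --benchmark-output benchmark.log --database results.sqlite --commit abc123 --report memory-report.md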
+""" + +import argparse +import logging +import os +import re +import sqlite3 +import sys +from datetime import datetime +from typing import Dict, List, Optional + +logger = logging.getLogger("memory-leak-monitor") + +MEMORY_STATUS_CODES = { + 0: "LLAMA_MEMORY_STATUS_SUCCESS", + 1: "LLAMA_MEMORY_STATUS_NO_UPDATE", + 2: "LLAMA_MEMORY_STATUS_FAILED_PREPARE", + 3: "LLAMA_MEMORY_STATUS_FAILED_COMPUTE", +} + +LEAK_THRESHOLD_KB = 1024 # 1 MB leak threshold +EXCESSIVE_MEMORY_THRESHOLD_GB = 16 # 16 GB excessive usage threshold + + +class MemoryLeakMonitor: + """Monitor memory usage and detect potential leaks.""" + + def __init__(self, db_path: Optional[str] = None): + """ + Initialize memory leak monitor. + + Args: + db_path: Optional path to SQLite database for storing results + """ + self.db_path = db_path + self.leaks_detected: List[Dict] = [] + self.memory_issues: List[Dict] = [] + + def parse_benchmark_output(self, output_file: str) -> List[Dict]: + """ + Parse benchmark output for memory usage information. + + Args: + output_file: Path to benchmark output file + + Returns: + List of memory usage records + """ + memory_records = [] + + if not os.path.exists(output_file): + logger.warning(f"Output file not found: {output_file}") + return memory_records + + with open(output_file, 'r') as f: + content = f.read() + + size_pattern = r'model size:\s+(\d+\.?\d*)\s+(GiB|MiB|GB|MB)' + usage_pattern = r'memory usage:\s+(\d+)\s+(MB|KB|GB)' + peak_pattern = r'peak memory:\s+(\d+\.?\d*)\s+(GB|MB)' + + for pattern_name, pattern in [ + ("model_size", size_pattern), + ("memory_usage", usage_pattern), + ("peak_memory", peak_pattern) + ]: + matches = re.finditer(pattern, content, re.IGNORECASE) + for match in matches: + value = float(match.group(1)) + unit = match.group(2).upper() + + if unit in ["GIB", "GB"]: + value_kb = value * 1024 * 1024 + elif unit in ["MIB", "MB"]: + value_kb = value * 1024 + else: + value_kb = value + + memory_records.append({ + "type": pattern_name, + "value_kb": value_kb, + "original_value": match.group(1), + "unit": match.group(2) + }) + + logger.info(f"Parsed {len(memory_records)} memory records from {output_file}") + return memory_records + + def parse_test_logs(self, log_file: str) -> List[Dict]: + """ + Parse test logs for memory status codes. 
+ + Args: + log_file: Path to test log file + + Returns: + List of memory status records + """ + status_records = [] + + if not os.path.exists(log_file): + logger.warning(f"Log file not found: {log_file}") + return status_records + + with open(log_file, 'r') as f: + lines = f.readlines() + + status_pattern = r'memory.*status[:\s]+(\d+)' + failure_pattern = r'memory.*(?:leak|fail|error)' + + for i, line in enumerate(lines): + status_match = re.search(status_pattern, line, re.IGNORECASE) + if status_match: + status_code = int(status_match.group(1)) + status_name = MEMORY_STATUS_CODES.get(status_code, "UNKNOWN") + + status_records.append({ + "line_number": i + 1, + "status_code": status_code, + "status_name": status_name, + "line": line.strip(), + "is_failure": status_code >= 2 + }) + + failure_match = re.search(failure_pattern, line, re.IGNORECASE) + if failure_match: + status_records.append({ + "line_number": i + 1, + "status_code": -1, + "status_name": "MEMORY_ISSUE_DETECTED", + "line": line.strip(), + "is_failure": True + }) + + logger.info(f"Parsed {len(status_records)} memory status records from {log_file}") + return status_records + + def detect_leaks( + self, + initial_memory_kb: float, + final_memory_kb: float, + test_name: str = "unknown" + ) -> Optional[Dict]: + """ + Detect memory leaks by comparing initial and final memory usage. + + Args: + initial_memory_kb: Initial memory usage in KB + final_memory_kb: Final memory usage in KB + test_name: Name of the test + + Returns: + Leak information if detected, None otherwise + """ + leaked_kb = final_memory_kb - initial_memory_kb + + if leaked_kb > LEAK_THRESHOLD_KB: + leak_info = { + "test_name": test_name, + "initial_memory_kb": initial_memory_kb, + "final_memory_kb": final_memory_kb, + "leaked_memory_kb": leaked_kb, + "leaked_memory_mb": leaked_kb / 1024, + "timestamp": datetime.now().isoformat() + } + self.leaks_detected.append(leak_info) + logger.warning(f"Memory leak detected in {test_name}: {leak_info['leaked_memory_mb']:.2f} MB") + return leak_info + + return None + + def check_excessive_usage(self, memory_kb: float, test_name: str = "unknown") -> bool: + """ + Check if memory usage exceeds acceptable thresholds. + + Args: + memory_kb: Memory usage in KB + test_name: Name of the test + + Returns: + True if excessive usage detected + """ + memory_gb = memory_kb / (1024 * 1024) + + if memory_gb > EXCESSIVE_MEMORY_THRESHOLD_GB: + issue = { + "test_name": test_name, + "memory_kb": memory_kb, + "memory_gb": memory_gb, + "threshold_gb": EXCESSIVE_MEMORY_THRESHOLD_GB, + "timestamp": datetime.now().isoformat() + } + self.memory_issues.append(issue) + logger.warning( + f"Excessive memory usage in {test_name}: " + f"{memory_gb:.2f} GB (threshold: {EXCESSIVE_MEMORY_THRESHOLD_GB} GB)" + ) + return True + + return False + + def store_results(self, build_commit: str): + """ + Store memory monitoring results in database. 
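+
+        The memory_leak_logs table is created on demand with the same columns as
+        scripts/db-schema-migration.sql, so this also works against a fresh results database.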
+ + Args: + build_commit: Git commit SHA + """ + if not self.db_path: + logger.warning("No database path configured, skipping storage") + return + + try: + conn = sqlite3.connect(self.db_path) + cursor = conn.cursor() + + cursor.execute(""" + CREATE TABLE IF NOT EXISTS memory_leak_logs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + test_time TEXT NOT NULL, + build_commit TEXT NOT NULL, + test_name TEXT NOT NULL, + memory_status TEXT NOT NULL, + initial_memory_kb INTEGER, + final_memory_kb INTEGER, + peak_memory_kb INTEGER, + leaked_memory_kb INTEGER, + status_code INTEGER, + error_message TEXT + ) + """) + + for leak in self.leaks_detected: + cursor.execute(""" + INSERT INTO memory_leak_logs ( + test_time, build_commit, test_name, memory_status, + initial_memory_kb, final_memory_kb, leaked_memory_kb, + status_code + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, ( + datetime.now().isoformat(), + build_commit, + leak["test_name"], + "LEAK_DETECTED", + int(leak["initial_memory_kb"]), + int(leak["final_memory_kb"]), + int(leak["leaked_memory_kb"]), + -1 + )) + + for issue in self.memory_issues: + cursor.execute(""" + INSERT INTO memory_leak_logs ( + test_time, build_commit, test_name, memory_status, + peak_memory_kb, status_code + ) VALUES (?, ?, ?, ?, ?, ?) + """, ( + datetime.now().isoformat(), + build_commit, + issue["test_name"], + "EXCESSIVE_USAGE", + int(issue["memory_kb"]), + -2 + )) + + conn.commit() + conn.close() + logger.info(f"Stored {len(self.leaks_detected)} leak records and " + f"{len(self.memory_issues)} excessive usage records") + + except sqlite3.Error as e: + logger.error(f"Error storing results: {e}") + + def generate_report(self, output_file: str): + """ + Generate a markdown report of memory monitoring results. + + Args: + output_file: Path to output markdown file + """ + with open(output_file, 'w') as f: + f.write("# Memory Leak Monitoring Report\n\n") + f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") + + if self.leaks_detected: + f.write("## ⚠️ Memory Leaks Detected\n\n") + f.write(f"**Total Leaks:** {len(self.leaks_detected)}\n\n") + + for leak in self.leaks_detected: + f.write(f"### {leak['test_name']}\n\n") + f.write(f"- **Initial Memory:** {leak['initial_memory_kb'] / 1024:.2f} MB\n") + f.write(f"- **Final Memory:** {leak['final_memory_kb'] / 1024:.2f} MB\n") + f.write(f"- **Leaked:** {leak['leaked_memory_mb']:.2f} MB\n\n") + else: + f.write("## ✅ No Memory Leaks Detected\n\n") + + if self.memory_issues: + f.write("## ⚠️ Excessive Memory Usage\n\n") + f.write(f"**Total Issues:** {len(self.memory_issues)}\n\n") + + for issue in self.memory_issues: + f.write(f"### {issue['test_name']}\n\n") + f.write(f"- **Memory Used:** {issue['memory_gb']:.2f} GB\n") + f.write(f"- **Threshold:** {issue['threshold_gb']} GB\n\n") + + logger.info(f"Report written to {output_file}") + + +def main(): + """Main entry point for memory leak monitor.""" + parser = argparse.ArgumentParser( + description="Monitor memory usage and detect leaks in llama.cpp benchmarks" + ) + parser.add_argument( + "--benchmark-output", + help="Path to benchmark output file to analyze" + ) + parser.add_argument( + "--test-log", + help="Path to test log file to analyze" + ) + parser.add_argument( + "--database", + help="Path to SQLite database for storing results" + ) + parser.add_argument( + "--commit", + default="unknown", + help="Git commit SHA for this run" + ) + parser.add_argument( + "--report", + default="memory-report.md", + help="Output path for memory report" + ) + 
parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose logging" + ) + + args = parser.parse_args() + + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + + monitor = MemoryLeakMonitor(db_path=args.database) + + if args.benchmark_output: + memory_records = monitor.parse_benchmark_output(args.benchmark_output) + if len(memory_records) >= 2: + initial = memory_records[0]["value_kb"] + final = memory_records[-1]["value_kb"] + monitor.detect_leaks(initial, final, "benchmark") + + if args.test_log: + status_records = monitor.parse_test_logs(args.test_log) + for record in status_records: + if record.get("is_failure"): + logger.error(f"Memory failure at line {record['line_number']}: {record['line']}") + + if args.database: + monitor.store_results(args.commit) + + monitor.generate_report(args.report) + + has_issues = bool(monitor.leaks_detected or monitor.memory_issues) + sys.exit(1 if has_issues else 0) + + +if __name__ == "__main__": + main() diff --git a/scripts/performance-regression-detector.py b/scripts/performance-regression-detector.py new file mode 100755 index 0000000000000..30efd55214476 --- /dev/null +++ b/scripts/performance-regression-detector.py @@ -0,0 +1,407 @@ +#!/usr/bin/env python3 +""" +Performance Regression Detector for llama.cpp + +This script compares benchmark results between baseline and current runs, +detecting performance regressions above a configurable threshold. + +It integrates with the existing llama-bench SQLite database schema and +provides automated alerts for CI/CD pipelines. +""" + +import argparse +import json +import logging +import os +import sqlite3 +import sys +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +LLAMA_BENCH_DB_FIELDS = [ + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", + "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", + "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", + "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", + "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", +] + +BENCHMARK_KEY_PROPERTIES = [ + "model_type", "n_batch", "n_ubatch", "n_threads", "n_gpu_layers", + "backends", "n_prompt", "n_gen", "flash_attn" +] + +PERFORMANCE_METRICS = { + "avg_ts": { + "name": "Average Tokens/Second", + "unit": "tokens/s", + "direction": "higher_is_better", + "format": "{:.2f}" + }, + "avg_ns": { + "name": "Average Latency", + "unit": "ns", + "direction": "lower_is_better", + "format": "{:.0f}" + }, + "model_size": { + "name": "Model Size", + "unit": "bytes", + "direction": "lower_is_better", + "format": "{:.0f}" + } +} + +logger = logging.getLogger("performance-regression-detector") + + +class RegressionDetector: + """Detects performance regressions by comparing benchmark results.""" + + def __init__(self, baseline_db: str, current_db: str, threshold: float = 5.0): + """ + Initialize the regression detector. 
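+
+        Example:
+            detector = RegressionDetector("baseline.sqlite", "current.sqlite", threshold=5.0)
+            analysis = detector.analyze()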
+ + Args: + baseline_db: Path to baseline SQLite database + current_db: Path to current run SQLite database + threshold: Regression threshold percentage (default: 5.0) + """ + self.baseline_db = baseline_db + self.current_db = current_db + self.threshold = threshold + self.regressions: List[Dict[str, Any]] = [] + self.improvements: List[Dict[str, Any]] = [] + self.stable: List[Dict[str, Any]] = [] + + def load_results(self, db_path: str) -> List[Dict[str, Any]]: + """Load benchmark results from SQLite database.""" + if not os.path.exists(db_path): + logger.warning(f"Database not found: {db_path}") + return [] + + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + try: + cursor.execute("SELECT * FROM test") + results = [dict(row) for row in cursor.fetchall()] + logger.info(f"Loaded {len(results)} results from {db_path}") + return results + except sqlite3.OperationalError as e: + logger.error(f"Error reading database {db_path}: {e}") + return [] + finally: + conn.close() + + def match_benchmark( + self, baseline: Dict[str, Any], current_results: List[Dict[str, Any]] + ) -> Optional[Dict[str, Any]]: + """ + Find matching benchmark in current results based on key properties. + + Args: + baseline: Baseline benchmark result + current_results: List of current benchmark results + + Returns: + Matching benchmark or None if no match found + """ + for current in current_results: + match = True + for key in BENCHMARK_KEY_PROPERTIES: + if key not in baseline or key not in current: + continue + if baseline[key] != current[key]: + match = False + break + if match: + return current + return None + + def calculate_regression( + self, metric_name: str, baseline_value: float, current_value: float + ) -> Tuple[float, bool]: + """ + Calculate regression percentage and determine if it exceeds threshold. + + Args: + metric_name: Name of the metric being compared + baseline_value: Baseline metric value + current_value: Current metric value + + Returns: + Tuple of (change_percentage, is_regression) + """ + if baseline_value == 0: + return 0.0, False + + metric_info = PERFORMANCE_METRICS.get(metric_name, {}) + direction = metric_info.get("direction", "higher_is_better") + + change_pct = ((current_value - baseline_value) / baseline_value) * 100 + + if direction == "higher_is_better": + is_regression = change_pct < -self.threshold + else: + is_regression = change_pct > self.threshold + + return change_pct, is_regression + + def analyze(self) -> Dict[str, Any]: + """ + Analyze benchmark results and detect regressions. 
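+
+        On a successful comparison the result contains "status", "threshold", "regressions",
+        "improvements", "stable" and "summary"; this is the same structure that main()
+        serializes when --json-output is given.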
+ + Returns: + Dictionary containing analysis results + """ + logger.info("Starting regression analysis...") + + baseline_results = self.load_results(self.baseline_db) + current_results = self.load_results(self.current_db) + + if not baseline_results: + logger.warning("No baseline results found - skipping comparison") + return { + "status": "no_baseline", + "message": "No baseline results available for comparison", + "regressions": [], + "improvements": [], + "stable": [], + "summary": { + "total_benchmarks": 0, + "regressions_found": 0, + "improvements_found": 0, + "stable_benchmarks": 0 + } + } + + if not current_results: + logger.error("No current results found") + return { + "status": "error", + "message": "No current results found", + "regressions": [], + "improvements": [], + "stable": [], + "summary": { + "total_benchmarks": len(baseline_results), + "regressions_found": 0, + "improvements_found": 0, + "stable_benchmarks": 0 + } + } + + for baseline in baseline_results: + current = self.match_benchmark(baseline, current_results) + if not current: + logger.debug(f"No matching current result for baseline: {baseline.get('model_type')}") + continue + + benchmark_key = self._generate_benchmark_key(baseline) + has_regression = False + has_improvement = False + changes = {} + + for metric_name in ["avg_ts", "avg_ns"]: + if metric_name not in baseline or metric_name not in current: + continue + + baseline_value = baseline[metric_name] + current_value = current[metric_name] + + if baseline_value is None or current_value is None: + continue + + change_pct, is_regression = self.calculate_regression( + metric_name, baseline_value, current_value + ) + + metric_info = PERFORMANCE_METRICS[metric_name] + changes[metric_name] = { + "baseline": baseline_value, + "current": current_value, + "change_pct": change_pct, + "is_regression": is_regression, + "unit": metric_info["unit"], + "name": metric_info["name"] + } + + if is_regression: + has_regression = True + elif abs(change_pct) > self.threshold: + has_improvement = True + + result = { + "benchmark_key": benchmark_key, + "baseline": baseline, + "current": current, + "changes": changes + } + + if has_regression: + self.regressions.append(result) + elif has_improvement: + self.improvements.append(result) + else: + self.stable.append(result) + + status = "regression" if self.regressions else "pass" + + return { + "status": status, + "threshold": self.threshold, + "regressions": self.regressions, + "improvements": self.improvements, + "stable": self.stable, + "summary": { + "total_benchmarks": len(baseline_results), + "regressions_found": len(self.regressions), + "improvements_found": len(self.improvements), + "stable_benchmarks": len(self.stable) + } + } + + def _generate_benchmark_key(self, benchmark: Dict[str, Any]) -> str: + """Generate a human-readable key for a benchmark.""" + parts = [] + if "model_type" in benchmark: + parts.append(benchmark["model_type"]) + if "backends" in benchmark: + parts.append(f"backend:{benchmark['backends']}") + if "n_gpu_layers" in benchmark and benchmark["n_gpu_layers"]: + parts.append(f"ngl:{benchmark['n_gpu_layers']}") + if "n_prompt" in benchmark: + parts.append(f"p:{benchmark['n_prompt']}") + if "n_gen" in benchmark: + parts.append(f"g:{benchmark['n_gen']}") + return " | ".join(parts) if parts else "unknown" + + def generate_report(self, output_path: str, analysis: Dict[str, Any]): + """Generate a markdown report of the regression analysis.""" + with open(output_path, "w") as f: + f.write("# Performance Regression 
Analysis Report\n\n") + f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") + f.write(f"**Threshold:** {self.threshold}%\n\n") + + summary = analysis["summary"] + f.write("## Summary\n\n") + f.write(f"- **Total Benchmarks Compared:** {summary['total_benchmarks']}\n") + f.write(f"- **Regressions Found:** {summary['regressions_found']}\n") + f.write(f"- **Improvements Found:** {summary['improvements_found']}\n") + f.write(f"- **Stable Benchmarks:** {summary['stable_benchmarks']}\n\n") + + if analysis["status"] == "regression": + f.write("## ⚠️ Performance Regressions Detected\n\n") + for reg in analysis["regressions"]: + self._write_benchmark_section(f, reg, "Regression") + elif analysis["status"] == "no_baseline": + f.write("## ℹ️ No Baseline Available\n\n") + f.write(analysis["message"] + "\n\n") + else: + f.write("## ✅ No Performance Regressions Detected\n\n") + + if analysis["improvements"]: + f.write("## 📈 Performance Improvements\n\n") + for imp in analysis["improvements"]: + self._write_benchmark_section(f, imp, "Improvement") + + if analysis.get("stable"): + f.write("## 📊 Stable Performance\n\n") + f.write(f"**{len(analysis['stable'])} benchmarks** showed stable performance ") + f.write(f"(within ±{self.threshold}% threshold).\n\n") + + logger.info(f"Report written to {output_path}") + + def _write_benchmark_section(self, f, result: Dict[str, Any], section_type: str): + """Write a benchmark comparison section to the report.""" + f.write(f"### {result['benchmark_key']}\n\n") + + for metric_name, change in result["changes"].items(): + if not change.get("is_regression") and section_type == "Regression": + continue + if change.get("is_regression") and section_type == "Improvement": + continue + + baseline_val = change["baseline"] + current_val = change["current"] + change_pct = change["change_pct"] + unit = change["unit"] + name = change["name"] + + icon = "⚠️" if change.get("is_regression") else "✅" + direction = "↓" if change_pct < 0 else "↑" + + f.write(f"{icon} **{name}**:\n") + f.write(f"- Baseline: {baseline_val:.2f} {unit}\n") + f.write(f"- Current: {current_val:.2f} {unit}\n") + f.write(f"- Change: {direction} {abs(change_pct):.2f}%\n\n") + + +def main(): + """Main entry point for the regression detector.""" + parser = argparse.ArgumentParser( + description="Detect performance regressions in llama.cpp benchmarks" + ) + parser.add_argument( + "--baseline", + required=True, + help="Path to baseline SQLite database" + ) + parser.add_argument( + "--current", + required=True, + help="Path to current run SQLite database" + ) + parser.add_argument( + "--threshold", + type=float, + default=5.0, + help="Regression threshold percentage (default: 5.0)" + ) + parser.add_argument( + "--output", + default="regression-report.md", + help="Output path for regression report (default: regression-report.md)" + ) + parser.add_argument( + "--json-output", + help="Optional JSON output path for machine-readable results" + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose logging" + ) + + args = parser.parse_args() + + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + + detector = RegressionDetector(args.baseline, args.current, args.threshold) + analysis = detector.analyze() + + detector.generate_report(args.output, analysis) + + if args.json_output: + with open(args.json_output, "w") as f: + json.dump(analysis, f, indent=2, default=str) + 
logger.info(f"JSON report written to {args.json_output}") + + if analysis["status"] == "regression": + Path("regression-detected.flag").touch() + logger.error(f"Performance regression detected: {len(analysis['regressions'])} benchmarks affected") + sys.exit(1) + else: + logger.info("No performance regressions detected") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 91719577564a9..dd183734c7c32 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -219,3 +219,14 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) target_link_libraries(${TEST_TARGET} PRIVATE llama) + +# Performance regression test suite +if (EXISTS ${CMAKE_SOURCE_DIR}/tools/llama-bench/llama-bench.cpp) + llama_test_cmd( + ${CMAKE_BINARY_DIR}/bin/llama-bench + NAME test-performance-regression-cpu + LABEL "performance" + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + ARGS -p 512 -n 128 -r 3 -o sql + ) +endif()