diff --git a/.github/workflows/performance-regression.yml b/.github/workflows/performance-regression.yml new file mode 100644 index 0000000000000..e177bbff684be --- /dev/null +++ b/.github/workflows/performance-regression.yml @@ -0,0 +1,445 @@ +name: Performance Regression Testing + +on: + workflow_dispatch: # allows manual triggering + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/performance-regression.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + 'tools/llama-bench/**', + 'scripts/performance-regression-detector.py', + 'scripts/compare-llama-bench.py' + ] + push: + branches: + - master + paths: [ + '.github/workflows/performance-regression.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + 'tools/llama-bench/**', + 'scripts/performance-regression-detector.py', + 'scripts/compare-llama-bench.py' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + REGRESSION_THRESHOLD: 5.0 + BASELINE_DB: performance-baseline.sqlite + RESULTS_DB: performance-results.sqlite + +jobs: + performance-cpu: + runs-on: ubuntu-latest + + steps: + - name: Clone + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for baseline comparison + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install Python dependencies + run: | + pip install GitPython tabulate matplotlib + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: performance-cpu + evict-old-files: 1d + + - name: Build llama-bench + run: | + cmake -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=OFF + cmake --build build --target llama-bench llama-cli -j $(nproc) + + - name: Download test model + run: | + mkdir -p models + # Download TinyLlama test model if not present + if [ ! 
-f models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf ]; then + wget -q --show-progress -O models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf + fi + + - name: Restore baseline database + id: restore-baseline + uses: actions/cache/restore@v4 + with: + path: ${{ env.BASELINE_DB }} + key: perf-baseline-cpu-${{ github.base_ref || 'master' }} + restore-keys: | + perf-baseline-cpu- + + - name: Run baseline benchmark (if no baseline exists) + if: steps.restore-baseline.outputs.cache-hit != 'true' + run: | + git checkout ${{ github.event.pull_request.base.sha || github.event.before || 'master' }} || true + cmake --build build --target llama-bench -j $(nproc) || true + ./build/bin/llama-bench \ + -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + -p 512 -n 128 -r 3 \ + -o sql | sqlite3 ${{ env.BASELINE_DB }} || true + git checkout - + + - name: Run current benchmark + run: | + ./build/bin/llama-bench \ + -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + -p 512 -n 128 -r 3 \ + -o sql | sqlite3 ${{ env.RESULTS_DB }} + + - name: Detect performance regressions + id: detect-regression + run: | + python scripts/performance-regression-detector.py \ + --baseline ${{ env.BASELINE_DB }} \ + --current ${{ env.RESULTS_DB }} \ + --threshold ${{ env.REGRESSION_THRESHOLD }} \ + --output regression-report.md + + # Set output for subsequent steps + if [ -f regression-detected.flag ]; then + echo "regression=true" >> $GITHUB_OUTPUT + else + echo "regression=false" >> $GITHUB_OUTPUT + fi + + - name: Upload regression report + if: always() + uses: actions/upload-artifact@v4 + with: + name: performance-report-cpu + path: | + regression-report.md + ${{ env.RESULTS_DB }} + ${{ env.BASELINE_DB }} + + - name: Comment on PR with results + if: github.event_name == 'pull_request' && always() + continue-on-error: true + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + let report = 'Performance Regression Test Results (CPU)\n\n'; + + if (fs.existsSync('regression-report.md')) { + report += fs.readFileSync('regression-report.md', 'utf8'); + } else { + report += 'No regression report generated.'; + } + + try { + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: report + }); + } catch (error) { + console.log('Could not post comment (likely permissions issue):', error.message); + } + + - name: Fail if regression detected + if: steps.detect-regression.outputs.regression == 'true' + run: | + echo "⚠️ Performance regression detected! Check the report for details." 
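+          # regression-report.md (uploaded above as the performance-report-cpu artifact) lists the affected benchmarks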
+ exit 1 + + - name: Save baseline database + if: github.event_name == 'push' && github.ref == 'refs/heads/master' + uses: actions/cache/save@v4 + with: + path: ${{ env.RESULTS_DB }} + key: perf-baseline-cpu-master-${{ github.sha }} + + performance-cuda: + runs-on: gpu-runner + if: false # Disabled by default - enable when GPU runners are available + + steps: + - name: Clone + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install Python dependencies + run: | + pip install GitPython tabulate matplotlib + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: performance-cuda + evict-old-files: 1d + + - name: Build llama-bench with CUDA + run: | + cmake -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_CUDA=ON \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=OFF + cmake --build build --target llama-bench llama-cli -j $(nproc) + + - name: Download test model + run: | + mkdir -p models + if [ ! -f models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf ]; then + wget -q --show-progress -O models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf + fi + + - name: Restore baseline database + id: restore-baseline-cuda + uses: actions/cache/restore@v4 + with: + path: ${{ env.BASELINE_DB }} + key: perf-baseline-cuda-${{ github.base_ref || 'master' }} + restore-keys: | + perf-baseline-cuda- + + - name: Run baseline benchmark (if no baseline exists) + if: steps.restore-baseline-cuda.outputs.cache-hit != 'true' + run: | + git checkout ${{ github.event.pull_request.base.sha || github.event.before || 'master' }} || true + cmake --build build --target llama-bench -j $(nproc) || true + ./build/bin/llama-bench \ + -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + -ngl 99 -p 512 -n 128 -r 3 \ + -o sql | sqlite3 ${{ env.BASELINE_DB }} || true + git checkout - + + - name: Run current benchmark + run: | + ./build/bin/llama-bench \ + -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + -ngl 99 -p 512 -n 128 -r 3 \ + -o sql | sqlite3 ${{ env.RESULTS_DB }} + + - name: Detect performance regressions + id: detect-regression-cuda + run: | + python scripts/performance-regression-detector.py \ + --baseline ${{ env.BASELINE_DB }} \ + --current ${{ env.RESULTS_DB }} \ + --threshold ${{ env.REGRESSION_THRESHOLD }} \ + --output regression-report-cuda.md + + if [ -f regression-detected.flag ]; then + echo "regression=true" >> $GITHUB_OUTPUT + else + echo "regression=false" >> $GITHUB_OUTPUT + fi + + - name: Upload regression report + if: always() + uses: actions/upload-artifact@v4 + with: + name: performance-report-cuda + path: | + regression-report-cuda.md + ${{ env.RESULTS_DB }} + ${{ env.BASELINE_DB }} + + - name: Comment on PR with results + if: github.event_name == 'pull_request' && always() + continue-on-error: true + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + let report = 'Performance Regression Test Results (CUDA)\n\n'; + + if (fs.existsSync('regression-report-cuda.md')) { + report += fs.readFileSync('regression-report-cuda.md', 'utf8'); + } else { + report += 'No regression report generated.'; + } + + try { + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: report + }); + } catch (error) { + console.log('Could not post comment (likely permissions issue):', error.message); + } + 
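+      # NOTE: mirrors the CPU job - performance-regression-detector.py drops regression-detected.flag
+      # when any metric degrades by more than REGRESSION_THRESHOLD, and the step below turns the
+      # resulting detect-regression-cuda output into a hard job failure.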
+ - name: Fail if regression detected + if: steps.detect-regression-cuda.outputs.regression == 'true' + run: | + echo "⚠️ Performance regression detected! Check the report for details." + exit 1 + + - name: Save baseline database + if: github.event_name == 'push' && github.ref == 'refs/heads/master' + uses: actions/cache/save@v4 + with: + path: ${{ env.RESULTS_DB }} + key: perf-baseline-cuda-master-${{ github.sha }} + + performance-metal: + runs-on: macos-14 # macOS with Apple Silicon for Metal testing + + steps: + - name: Clone + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install Python dependencies + run: | + pip install GitPython tabulate matplotlib + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: performance-metal + evict-old-files: 1d + + - name: Build llama-bench with Metal + run: | + cmake -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_METAL=ON \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=OFF + cmake --build build --target llama-bench llama-cli -j $(sysctl -n hw.logicalcpu) + + - name: Download test model + run: | + mkdir -p models + if [ ! -f models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf ]; then + wget -q --show-progress -O models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf + fi + + - name: Restore baseline database + id: restore-baseline-metal + uses: actions/cache/restore@v4 + with: + path: ${{ env.BASELINE_DB }} + key: perf-baseline-metal-${{ github.base_ref || 'master' }} + restore-keys: | + perf-baseline-metal- + + - name: Run baseline benchmark (if no baseline exists) + if: steps.restore-baseline-metal.outputs.cache-hit != 'true' + run: | + git checkout ${{ github.event.pull_request.base.sha || github.event.before || 'master' }} || true + cmake --build build --target llama-bench -j $(sysctl -n hw.logicalcpu) || true + ./build/bin/llama-bench \ + -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + -ngl 99 -p 512 -n 128 -r 3 \ + -o sql | sqlite3 ${{ env.BASELINE_DB }} || true + git checkout - + + - name: Run current benchmark + run: | + ./build/bin/llama-bench \ + -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + -ngl 99 -p 512 -n 128 -r 3 \ + -o sql | sqlite3 ${{ env.RESULTS_DB }} + + - name: Detect performance regressions + id: detect-regression-metal + run: | + python scripts/performance-regression-detector.py \ + --baseline ${{ env.BASELINE_DB }} \ + --current ${{ env.RESULTS_DB }} \ + --threshold ${{ env.REGRESSION_THRESHOLD }} \ + --output regression-report-metal.md + + if [ -f regression-detected.flag ]; then + echo "regression=true" >> $GITHUB_OUTPUT + else + echo "regression=false" >> $GITHUB_OUTPUT + fi + + - name: Upload regression report + if: always() + uses: actions/upload-artifact@v4 + with: + name: performance-report-metal + path: | + regression-report-metal.md + ${{ env.RESULTS_DB }} + ${{ env.BASELINE_DB }} + + - name: Comment on PR with results + if: github.event_name == 'pull_request' && always() + continue-on-error: true + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + let report = 'Performance Regression Test Results (Metal)\n\n'; + + if (fs.existsSync('regression-report-metal.md')) { + report += fs.readFileSync('regression-report-metal.md', 'utf8'); + } else { + report += 'No regression report generated.'; + } + + try { + await github.rest.issues.createComment({ + issue_number: 
context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: report + }); + } catch (error) { + console.log('Could not post comment (likely permissions issue):', error.message); + } + + - name: Fail if regression detected + if: steps.detect-regression-metal.outputs.regression == 'true' + run: | + echo "⚠️ Performance regression detected! Check the report for details." + exit 1 + + - name: Save baseline database + if: github.event_name == 'push' && github.ref == 'refs/heads/master' + uses: actions/cache/save@v4 + with: + path: ${{ env.RESULTS_DB }} + key: perf-baseline-metal-master-${{ github.sha }} diff --git a/docs/performance-regression-testing.md b/docs/performance-regression-testing.md new file mode 100644 index 0000000000000..eb94b5db62840 --- /dev/null +++ b/docs/performance-regression-testing.md @@ -0,0 +1,366 @@ +# Performance Regression Testing + +This document describes the automated performance regression testing system for llama.cpp, implemented as part of JIRA ticket AT-105. + +## Overview + +The performance regression testing system automatically detects performance degradations in llama.cpp by comparing benchmark results against established baselines. It integrates with GitHub Actions CI/CD pipelines and provides automated alerts when performance regressions exceed a configurable threshold (default: 5%). + +## Components + +### 1. GitHub Actions Workflow + +**File:** `.github/workflows/performance-regression.yml` + +The workflow runs performance benchmarks on different hardware backends (CPU, CUDA, Metal) for every pull request and push to master. It: + +- Builds the `llama-bench` target +- Downloads a test model (TinyLlama 1.1B) +- Runs benchmarks with consistent parameters +- Compares results against cached baselines +- Posts results as PR comments +- Fails the build if regressions are detected + +**Jobs:** +- `performance-cpu`: Runs on Ubuntu with CPU backend +- `performance-cuda`: Runs on GPU runners (disabled by default) +- `performance-metal`: Runs on macOS with Apple Silicon + +**Triggers:** +- Pull requests to any branch +- Pushes to master branch +- Manual workflow dispatch + +### 2. Performance Regression Detector + +**File:** `scripts/performance-regression-detector.py` + +Python script that analyzes benchmark results and detects performance regressions. + +**Usage:** +```bash +python3 scripts/performance-regression-detector.py \ + --baseline baseline.sqlite \ + --current current.sqlite \ + --threshold 5.0 \ + --output regression-report.md +``` + +**Features:** +- Compares multiple performance metrics (tokens/second, latency) +- Configurable regression threshold +- Generates markdown and JSON reports +- Creates flag file when regressions detected +- Integrates with existing llama-bench SQLite schema + +**Key Metrics:** +- `avg_ts`: Average tokens per second (higher is better) +- `avg_ns`: Average latency in nanoseconds (lower is better) +- `model_size`: Model memory footprint (lower is better) + +### 3. Enhanced Comparison Script + +**File:** `scripts/compare-llama-bench.py` (enhanced) + +The existing comparison script has been extended with CI automation support. 
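+
+Among the additions (listed below) is a `--json-output` flag whose file can be consumed by
+follow-up CI steps. A minimal sketch of reading it, assuming only the field names written by
+this patch (`baseline`, `compare`, `tool`, `headers`, `table`, `timestamp`); the summary line
+itself is illustrative:
+
+```python
+#!/usr/bin/env python3
+"""Print a one-line summary of a compare-llama-bench.py --json-output file."""
+import json
+import sys
+
+with open(sys.argv[1]) as f:
+    data = json.load(f)
+
+# "baseline" and "compare" name the two builds being compared; "table" holds the comparison rows
+print(f"{data['tool']}: {data['baseline']} vs {data['compare']} "
+      f"({len(data['table'])} rows, generated {data['timestamp']})")
+```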
+ +**New Features:** +- `--ci-mode`: Enable CI-specific formatting and behavior +- `--baseline-db`: Path to baseline database for tracking +- `--save-baseline`: Save current results as new baseline +- `--json-output`: Export comparison results to JSON + +**Example:** +```bash +python3 scripts/compare-llama-bench.py \ + -i results.sqlite \ + --ci-mode \ + --json-output comparison.json +``` + +### 4. Database Schema Extensions + +**Files:** +- `scripts/db-schema-migration.sql`: SQL migration script +- `scripts/apply-db-migration.py`: Migration application tool + +The database schema has been extended to support: + +**New Tables:** +- `performance_baselines`: Stores baseline snapshots +- `performance_history`: Historical performance data +- `regression_alerts`: Logged regression detections +- `memory_leak_logs`: Memory leak monitoring results + +**Views:** +- `latest_baselines`: Active baseline information +- `regression_summary`: Aggregated regression statistics +- `memory_leak_summary`: Memory leak detection summary + +**Applying Migrations:** +```bash +python3 scripts/apply-db-migration.py -d llama-bench.sqlite +``` + +### 5. Memory Leak Monitoring + +**File:** `scripts/memory-leak-monitor.py` + +Integrates with the existing `llama-memory.h` interfaces to detect memory leaks and excessive memory consumption. + +**Usage:** +```bash +python3 scripts/memory-leak-monitor.py \ + --benchmark-output benchmark.log \ + --test-log test.log \ + --database results.sqlite \ + --commit abc123 \ + --report memory-report.md +``` + +**Features:** +- Parses benchmark output for memory usage patterns +- Detects memory leaks (threshold: 1 MB) +- Monitors excessive memory usage (threshold: 16 GB) +- Logs results to database +- Generates markdown reports + +**Memory Status Codes** (from `llama-memory.h`): +- `0`: `LLAMA_MEMORY_STATUS_SUCCESS` +- `1`: `LLAMA_MEMORY_STATUS_NO_UPDATE` +- `2`: `LLAMA_MEMORY_STATUS_FAILED_PREPARE` +- `3`: `LLAMA_MEMORY_STATUS_FAILED_COMPUTE` + +### 6. CMake Test Integration + +**File:** `tests/CMakeLists.txt` (extended) + +A new performance test target has been added: + +```cmake +llama_test_cmd( + ${CMAKE_BINARY_DIR}/bin/llama-bench + NAME test-performance-regression-cpu + LABEL "performance" + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + ARGS -p 512 -n 128 -r 3 -o sql +) +``` + +**Running Performance Tests:** +```bash +cd build +ctest -L performance --verbose +``` + +## Workflow + +### For Pull Requests + +1. Developer opens a PR with code changes +2. GitHub Actions triggers the performance regression workflow +3. The workflow: + - Builds llama-bench with the PR code + - Restores the baseline database from cache + - If no baseline exists, creates one from the base commit + - Runs benchmarks with the current code + - Compares results using the regression detector +4. Results are posted as a PR comment +5. Build fails if regressions exceed 5% threshold + +### For Master Branch Commits + +1. Code is merged to master +2. GitHub Actions runs the workflow +3. Benchmark results are cached as the new baseline +4. Historical data is stored in the database +5. 
Future PRs compare against this baseline + + ### Manual Baseline Management + + **Creating a Baseline:** + ```bash + # Run benchmarks + ./build/bin/llama-bench -m model.gguf -p 512 -n 128 -r 3 -o sql | sqlite3 baseline.sqlite + + # Save as baseline + python3 scripts/apply-db-migration.py -d baseline.sqlite + sqlite3 baseline.sqlite "INSERT INTO performance_baselines (baseline_name, commit_sha, created_at) VALUES ('v1.0', '$(git rev-parse HEAD)', '$(date -Iseconds)')" + ``` + + **Comparing Against Baseline:** + ```bash + # Run current benchmarks + ./build/bin/llama-bench -m model.gguf -p 512 -n 128 -r 3 -o sql | sqlite3 current.sqlite + + # Detect regressions + python3 scripts/performance-regression-detector.py \ + --baseline baseline.sqlite \ + --current current.sqlite \ + --threshold 5.0 + ``` + + ## Configuration + + ### Environment Variables + + - `REGRESSION_THRESHOLD`: Regression detection threshold (default: 5.0) + - `BASELINE_DB`: Baseline database filename (default: performance-baseline.sqlite) + - `RESULTS_DB`: Results database filename (default: performance-results.sqlite) + + ### Workflow Customization + + Edit `.github/workflows/performance-regression.yml` to: + + - Change benchmark parameters (prompt length, generation tokens, repetitions) + - Add/remove backend configurations + - Modify caching strategy + - Adjust model selection + + ### Threshold Configuration + + The default 5% threshold is applied globally; per-backend values can be set by overriding `REGRESSION_THRESHOLD` per job, and per-metric thresholds can be added by extending `PERFORMANCE_METRICS` in `performance-regression-detector.py`, for example: + + ```python + # In performance-regression-detector.py + PERFORMANCE_METRICS = { + "avg_ts": { + "threshold": 5.0, # Custom threshold for this metric + ... + } + } + ``` + + ## Reports + + ### Regression Report Format + + ```markdown + # Performance Regression Analysis Report + + **Generated:** 2025-09-29 12:34:56 + **Threshold:** 5.0% + + ## Summary + - Total Benchmarks Compared: 10 + - Regressions Found: 2 + - Improvements Found: 3 + - Stable Benchmarks: 5 + + ## ⚠️ Performance Regressions Detected + + ### TinyLlama-1.1B | backend:CPU | p:512 | g:128 + + ⚠️ **Average Tokens/Second**: + - Baseline: 45.23 tokens/s + - Current: 42.15 tokens/s + - Change: ↓ 6.81% + + ... + ``` + + ### Memory Leak Report Format + + ```markdown + # Memory Leak Monitoring Report + + **Generated:** 2025-09-29 12:34:56 + + ## ⚠️ Memory Leaks Detected + + ### benchmark + - Initial Memory: 1234.56 MB + - Final Memory: 1250.78 MB + - Leaked: 16.22 MB + ``` + + ## Troubleshooting + + ### No Baseline Available + + If the baseline cache is empty or expired: + + 1. The workflow will attempt to build the baseline from the base commit + 2. If that fails, it will create a baseline from the current code + 3. Subsequent runs will use this baseline + + ### False Positives + + Regressions can be marked as false positives in the database: + + ```sql + UPDATE regression_alerts + SET status = 'false_positive', notes = 'Expected due to architectural change' + WHERE id = <alert_id>; + ``` + + ### Excessive Memory Usage Warnings + + If memory usage exceeds thresholds: + + 1. Review the memory leak report + 2. Check for memory leaks using valgrind or similar tools + 3. 
Adjust the threshold if legitimate increased usage + +## Integration with CI/CD + +### GitHub Actions Artifacts + +The workflow uploads artifacts containing: +- Regression reports (markdown) +- SQLite databases (baseline and current) +- Memory leak reports + +**Downloading Artifacts:** +```bash +gh run download -n performance-report-cpu +``` + +### PR Comments + +The workflow automatically comments on PRs with: +- Summary of regression detection +- Links to detailed reports +- Pass/fail status + +### Build Status + +The workflow sets the build status to: +- ✅ **Success**: No regressions detected +- ❌ **Failure**: Regressions exceed threshold +- ⚠️ **Warning**: Issues detected but below threshold + +## Best Practices + +1. **Run locally before PR**: Test performance changes locally +2. **Review memory reports**: Check for memory leaks regularly +3. **Update baselines**: Refresh baselines after major changes +4. **Monitor trends**: Use historical data to identify gradual degradation +5. **Document exceptions**: Note expected performance changes in PR descriptions + +## Future Enhancements + +Potential improvements to the system: + +- [ ] Add GPU-specific benchmarks when runners available +- [ ] Implement trend analysis over multiple commits +- [ ] Add visualization dashboard for historical performance +- [ ] Support for custom benchmark configurations per PR +- [ ] Integration with performance profiling tools +- [ ] Automatic bisection for regression identification +- [ ] Multi-model benchmark comparisons + +## References + +- [llama-bench documentation](../tools/llama-bench/README.md) +- [compare-llama-bench.py usage](../scripts/compare-llama-bench.py) +- [llama-memory.h interface](../src/llama-memory.h) +- [GitHub Actions workflows](../.github/workflows/) + +## Support + +For issues or questions: +- Check existing GitHub issues +- Review workflow run logs +- Examine generated reports +- Contact the performance testing team diff --git a/scripts/apply-db-migration.py b/scripts/apply-db-migration.py new file mode 100755 index 0000000000000..479517dcbae02 --- /dev/null +++ b/scripts/apply-db-migration.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +""" +Database Schema Migration Tool for llama.cpp Performance Testing + +This script applies schema migrations to extend the existing llama-bench +SQLite database with baseline tracking, historical data, and regression alerting. +""" + +import argparse +import logging +import os +import sqlite3 +import sys +from pathlib import Path + +logger = logging.getLogger("apply-db-migration") + + +def apply_migration(db_path: str, migration_sql_path: str, dry_run: bool = False) -> bool: + """ + Apply database schema migration. 
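+
+    Example (the migration path matches this script's default):
+        apply_migration("llama-bench.sqlite", "scripts/db-schema-migration.sql")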
+ + Args: + db_path: Path to SQLite database + migration_sql_path: Path to SQL migration script + dry_run: If True, print migration without applying + + Returns: + True if successful, False otherwise + """ + if not os.path.exists(migration_sql_path): + logger.error(f"Migration script not found: {migration_sql_path}") + return False + + with open(migration_sql_path, 'r') as f: + migration_sql = f.read() + + if dry_run: + logger.info("Dry run mode - migration would execute:") + logger.info(migration_sql) + return True + + try: + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + cursor.executescript(migration_sql) + conn.commit() + + logger.info(f"Migration applied successfully to {db_path}") + + cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + tables = [row[0] for row in cursor.fetchall()] + logger.info(f"Database tables: {', '.join(tables)}") + + conn.close() + return True + + except sqlite3.Error as e: + logger.error(f"Migration failed: {e}") + return False + + +def check_migration_status(db_path: str) -> dict: + """ + Check if migration has been applied to the database. + + Args: + db_path: Path to SQLite database + + Returns: + Dictionary with migration status information + """ + if not os.path.exists(db_path): + return {"exists": False, "migrated": False, "tables": []} + + try: + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + tables = [row[0] for row in cursor.fetchall()] + + migration_tables = [ + "performance_baselines", + "performance_history", + "regression_alerts", + "memory_leak_logs" + ] + + migrated = all(table in tables for table in migration_tables) + + conn.close() + + return { + "exists": True, + "migrated": migrated, + "tables": tables, + "migration_tables_present": [t for t in migration_tables if t in tables] + } + + except sqlite3.Error as e: + logger.error(f"Error checking database: {e}") + return {"exists": True, "migrated": False, "error": str(e)} + + +def main(): + """Main entry point for migration tool.""" + parser = argparse.ArgumentParser( + description="Apply database schema migrations for performance testing" + ) + parser.add_argument( + "--database", + "-d", + required=True, + help="Path to SQLite database" + ) + parser.add_argument( + "--migration", + "-m", + help="Path to migration SQL script (default: scripts/db-schema-migration.sql)" + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print migration without applying" + ) + parser.add_argument( + "--check", + action="store_true", + help="Check migration status without applying" + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Enable verbose logging" + ) + + args = parser.parse_args() + + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + + if not args.migration: + script_dir = Path(__file__).parent + args.migration = script_dir / "db-schema-migration.sql" + + if args.check: + status = check_migration_status(args.database) + logger.info(f"Database exists: {status.get('exists', False)}") + logger.info(f"Migration applied: {status.get('migrated', False)}") + if status.get('tables'): + logger.info(f"Tables present: {', '.join(status['tables'])}") + if status.get('migration_tables_present'): + logger.info(f"Migration tables: {', '.join(status['migration_tables_present'])}") + sys.exit(0 if status.get('migrated', False) else 1) + + success = 
apply_migration(args.database, str(args.migration), args.dry_run) + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index c45c83fdb55c3..9c0cc7833bbf6 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -9,6 +9,7 @@ import sqlite3 import sys from collections.abc import Iterator, Sequence +from datetime import datetime from glob import glob from typing import Any, Optional, Union @@ -175,6 +176,11 @@ parser.add_argument("--plot_x", help="parameter to use as x axis for plotting (default: n_depth)", default="n_depth") parser.add_argument("--plot_log_scale", action="store_true", help="use log scale for x axis in plots (off by default)") +parser.add_argument("--ci-mode", action="store_true", help="Enable CI mode for automated workflows") +parser.add_argument("--baseline-db", help="Path to baseline database for tracking performance over time") +parser.add_argument("--save-baseline", help="Save current results as baseline to specified database path") +parser.add_argument("--json-output", help="Export comparison results to JSON file for automated processing") + known_args, unknown_args = parser.parse_known_args() logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO) @@ -1091,3 +1097,23 @@ def make_axes(num_groups, max_cols=2, base_size=(8, 4)): floatfmt=".2f", tablefmt=known_args.output )) + +if known_args.json_output: + output_data = { + "baseline": name_baseline, + "compare": name_compare, + "tool": tool, + "headers": headers, + "table": table, + "timestamp": datetime.now().isoformat() + } + with open(known_args.json_output, "w") as f: + json.dump(output_data, f, indent=2, default=str) + logger.info(f"JSON output written to {known_args.json_output}") + +if known_args.save_baseline: + import shutil + if input_file: + baseline_path = known_args.save_baseline + shutil.copy(input_file[0], baseline_path) + logger.info(f"Baseline saved to {baseline_path}") diff --git a/scripts/db-schema-migration.sql b/scripts/db-schema-migration.sql new file mode 100644 index 0000000000000..acd3da348eedd --- /dev/null +++ b/scripts/db-schema-migration.sql @@ -0,0 +1,116 @@ + +CREATE TABLE IF NOT EXISTS performance_baselines ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + baseline_name TEXT NOT NULL, + commit_sha TEXT NOT NULL, + branch_name TEXT DEFAULT 'master', + created_at TEXT NOT NULL, + description TEXT, + is_active INTEGER DEFAULT 1, + UNIQUE(baseline_name, commit_sha) +); + +CREATE TABLE IF NOT EXISTS performance_history ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + test_time TEXT NOT NULL, + build_commit TEXT NOT NULL, + model_type TEXT, + backends TEXT, + n_gpu_layers INTEGER, + avg_ts REAL, + avg_ns INTEGER, + stddev_ts REAL, + stddev_ns INTEGER, + cpu_info TEXT, + gpu_info TEXT, + n_threads INTEGER, + n_prompt INTEGER, + n_gen INTEGER, + memory_usage_kb INTEGER, + memory_status TEXT, + FOREIGN KEY (build_commit) REFERENCES performance_baselines(commit_sha) +); + +CREATE TABLE IF NOT EXISTS regression_alerts ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + alert_time TEXT NOT NULL, + baseline_commit TEXT NOT NULL, + current_commit TEXT NOT NULL, + benchmark_key TEXT NOT NULL, + metric_name TEXT NOT NULL, + baseline_value REAL NOT NULL, + current_value REAL NOT NULL, + change_percentage REAL NOT NULL, + threshold_percentage REAL NOT NULL, + severity TEXT CHECK(severity IN ('warning', 'critical')) DEFAULT 'warning', + status TEXT CHECK(status IN ('open', 
'investigating', 'resolved', 'false_positive')) DEFAULT 'open', + notes TEXT +); + +CREATE TABLE IF NOT EXISTS memory_leak_logs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + test_time TEXT NOT NULL, + build_commit TEXT NOT NULL, + test_name TEXT NOT NULL, + memory_status TEXT NOT NULL, + initial_memory_kb INTEGER, + final_memory_kb INTEGER, + peak_memory_kb INTEGER, + leaked_memory_kb INTEGER, + status_code INTEGER, + error_message TEXT +); + +CREATE INDEX IF NOT EXISTS idx_performance_history_commit ON performance_history(build_commit); +CREATE INDEX IF NOT EXISTS idx_performance_history_time ON performance_history(test_time); +CREATE INDEX IF NOT EXISTS idx_performance_history_model ON performance_history(model_type); +CREATE INDEX IF NOT EXISTS idx_regression_alerts_time ON regression_alerts(alert_time); +CREATE INDEX IF NOT EXISTS idx_regression_alerts_status ON regression_alerts(status); +CREATE INDEX IF NOT EXISTS idx_memory_leak_logs_commit ON memory_leak_logs(build_commit); +CREATE INDEX IF NOT EXISTS idx_memory_leak_logs_time ON memory_leak_logs(test_time); + +CREATE VIEW IF NOT EXISTS latest_baselines AS +SELECT + b.baseline_name, + b.commit_sha, + b.branch_name, + b.created_at, + COUNT(h.id) as benchmark_count +FROM performance_baselines b +LEFT JOIN performance_history h ON b.commit_sha = h.build_commit +WHERE b.is_active = 1 +GROUP BY b.id +ORDER BY b.created_at DESC; + +CREATE VIEW IF NOT EXISTS regression_summary AS +SELECT + current_commit, + COUNT(*) as total_regressions, + SUM(CASE WHEN severity = 'critical' THEN 1 ELSE 0 END) as critical_count, + SUM(CASE WHEN severity = 'warning' THEN 1 ELSE 0 END) as warning_count, + AVG(ABS(change_percentage)) as avg_degradation +FROM regression_alerts +WHERE status = 'open' +GROUP BY current_commit +ORDER BY total_regressions DESC; + +CREATE VIEW IF NOT EXISTS memory_leak_summary AS +SELECT + build_commit, + COUNT(*) as total_tests, + SUM(CASE WHEN memory_status = 'LLAMA_MEMORY_STATUS_SUCCESS' THEN 1 ELSE 0 END) as passed_tests, + SUM(CASE WHEN leaked_memory_kb > 0 THEN 1 ELSE 0 END) as leak_detected, + SUM(leaked_memory_kb) as total_leaked_kb +FROM memory_leak_logs +GROUP BY build_commit +ORDER BY test_time DESC; + +CREATE TRIGGER IF NOT EXISTS update_regression_alert_timestamp +AFTER UPDATE ON regression_alerts +FOR EACH ROW +WHEN OLD.status != NEW.status +BEGIN + UPDATE regression_alerts + SET alert_time = datetime('now') + WHERE id = NEW.id; +END; diff --git a/scripts/memory-leak-monitor.py b/scripts/memory-leak-monitor.py new file mode 100755 index 0000000000000..cf57a21cbdabf --- /dev/null +++ b/scripts/memory-leak-monitor.py @@ -0,0 +1,383 @@ +#!/usr/bin/env python3 +""" +Memory Leak Monitoring Integration for llama.cpp + +This script integrates with the CI pipeline to monitor memory consumption +patterns using the existing llama_memory_status interfaces from llama-memory.h. + +It parses benchmark results and test logs to detect memory leaks and excessive +memory consumption that could indicate performance issues. 
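+
+Typical CI invocation (flags as defined in main() below):
+
+    python3 scripts/memory-leak-monitor.py --benchmark-output benchmark.log --database results.sqlite --commit abc123 --report memory-report.md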
+""" + +import argparse +import logging +import os +import re +import sqlite3 +import sys +from datetime import datetime +from typing import Dict, List, Optional + +logger = logging.getLogger("memory-leak-monitor") + +MEMORY_STATUS_CODES = { + 0: "LLAMA_MEMORY_STATUS_SUCCESS", + 1: "LLAMA_MEMORY_STATUS_NO_UPDATE", + 2: "LLAMA_MEMORY_STATUS_FAILED_PREPARE", + 3: "LLAMA_MEMORY_STATUS_FAILED_COMPUTE", +} + +LEAK_THRESHOLD_KB = 1024 # 1 MB leak threshold +EXCESSIVE_MEMORY_THRESHOLD_GB = 16 # 16 GB excessive usage threshold + + +class MemoryLeakMonitor: + """Monitor memory usage and detect potential leaks.""" + + def __init__(self, db_path: Optional[str] = None): + """ + Initialize memory leak monitor. + + Args: + db_path: Optional path to SQLite database for storing results + """ + self.db_path = db_path + self.leaks_detected: List[Dict] = [] + self.memory_issues: List[Dict] = [] + + def parse_benchmark_output(self, output_file: str) -> List[Dict]: + """ + Parse benchmark output for memory usage information. + + Args: + output_file: Path to benchmark output file + + Returns: + List of memory usage records + """ + memory_records = [] + + if not os.path.exists(output_file): + logger.warning(f"Output file not found: {output_file}") + return memory_records + + with open(output_file, 'r') as f: + content = f.read() + + size_pattern = r'model size:\s+(\d+\.?\d*)\s+(GiB|MiB|GB|MB)' + usage_pattern = r'memory usage:\s+(\d+)\s+(MB|KB|GB)' + peak_pattern = r'peak memory:\s+(\d+\.?\d*)\s+(GB|MB)' + + for pattern_name, pattern in [ + ("model_size", size_pattern), + ("memory_usage", usage_pattern), + ("peak_memory", peak_pattern) + ]: + matches = re.finditer(pattern, content, re.IGNORECASE) + for match in matches: + value = float(match.group(1)) + unit = match.group(2).upper() + + if unit in ["GIB", "GB"]: + value_kb = value * 1024 * 1024 + elif unit in ["MIB", "MB"]: + value_kb = value * 1024 + else: + value_kb = value + + memory_records.append({ + "type": pattern_name, + "value_kb": value_kb, + "original_value": match.group(1), + "unit": match.group(2) + }) + + logger.info(f"Parsed {len(memory_records)} memory records from {output_file}") + return memory_records + + def parse_test_logs(self, log_file: str) -> List[Dict]: + """ + Parse test logs for memory status codes. 
+ + Args: + log_file: Path to test log file + + Returns: + List of memory status records + """ + status_records = [] + + if not os.path.exists(log_file): + logger.warning(f"Log file not found: {log_file}") + return status_records + + with open(log_file, 'r') as f: + lines = f.readlines() + + status_pattern = r'memory.*status[:\s]+(\d+)' + failure_pattern = r'memory.*(?:leak|fail|error)' + + for i, line in enumerate(lines): + status_match = re.search(status_pattern, line, re.IGNORECASE) + if status_match: + status_code = int(status_match.group(1)) + status_name = MEMORY_STATUS_CODES.get(status_code, "UNKNOWN") + + status_records.append({ + "line_number": i + 1, + "status_code": status_code, + "status_name": status_name, + "line": line.strip(), + "is_failure": status_code >= 2 + }) + + failure_match = re.search(failure_pattern, line, re.IGNORECASE) + if failure_match: + status_records.append({ + "line_number": i + 1, + "status_code": -1, + "status_name": "MEMORY_ISSUE_DETECTED", + "line": line.strip(), + "is_failure": True + }) + + logger.info(f"Parsed {len(status_records)} memory status records from {log_file}") + return status_records + + def detect_leaks( + self, + initial_memory_kb: float, + final_memory_kb: float, + test_name: str = "unknown" + ) -> Optional[Dict]: + """ + Detect memory leaks by comparing initial and final memory usage. + + Args: + initial_memory_kb: Initial memory usage in KB + final_memory_kb: Final memory usage in KB + test_name: Name of the test + + Returns: + Leak information if detected, None otherwise + """ + leaked_kb = final_memory_kb - initial_memory_kb + + if leaked_kb > LEAK_THRESHOLD_KB: + leak_info = { + "test_name": test_name, + "initial_memory_kb": initial_memory_kb, + "final_memory_kb": final_memory_kb, + "leaked_memory_kb": leaked_kb, + "leaked_memory_mb": leaked_kb / 1024, + "timestamp": datetime.now().isoformat() + } + self.leaks_detected.append(leak_info) + logger.warning(f"Memory leak detected in {test_name}: {leak_info['leaked_memory_mb']:.2f} MB") + return leak_info + + return None + + def check_excessive_usage(self, memory_kb: float, test_name: str = "unknown") -> bool: + """ + Check if memory usage exceeds acceptable thresholds. + + Args: + memory_kb: Memory usage in KB + test_name: Name of the test + + Returns: + True if excessive usage detected + """ + memory_gb = memory_kb / (1024 * 1024) + + if memory_gb > EXCESSIVE_MEMORY_THRESHOLD_GB: + issue = { + "test_name": test_name, + "memory_kb": memory_kb, + "memory_gb": memory_gb, + "threshold_gb": EXCESSIVE_MEMORY_THRESHOLD_GB, + "timestamp": datetime.now().isoformat() + } + self.memory_issues.append(issue) + logger.warning( + f"Excessive memory usage in {test_name}: " + f"{memory_gb:.2f} GB (threshold: {EXCESSIVE_MEMORY_THRESHOLD_GB} GB)" + ) + return True + + return False + + def store_results(self, build_commit: str): + """ + Store memory monitoring results in database. 
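+
+        The memory_leak_logs table is created on demand with the same columns as
+        scripts/db-schema-migration.sql, so this also works against a fresh results database.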
+ + Args: + build_commit: Git commit SHA + """ + if not self.db_path: + logger.warning("No database path configured, skipping storage") + return + + try: + conn = sqlite3.connect(self.db_path) + cursor = conn.cursor() + + cursor.execute(""" + CREATE TABLE IF NOT EXISTS memory_leak_logs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + test_time TEXT NOT NULL, + build_commit TEXT NOT NULL, + test_name TEXT NOT NULL, + memory_status TEXT NOT NULL, + initial_memory_kb INTEGER, + final_memory_kb INTEGER, + peak_memory_kb INTEGER, + leaked_memory_kb INTEGER, + status_code INTEGER, + error_message TEXT + ) + """) + + for leak in self.leaks_detected: + cursor.execute(""" + INSERT INTO memory_leak_logs ( + test_time, build_commit, test_name, memory_status, + initial_memory_kb, final_memory_kb, leaked_memory_kb, + status_code + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, ( + datetime.now().isoformat(), + build_commit, + leak["test_name"], + "LEAK_DETECTED", + int(leak["initial_memory_kb"]), + int(leak["final_memory_kb"]), + int(leak["leaked_memory_kb"]), + -1 + )) + + for issue in self.memory_issues: + cursor.execute(""" + INSERT INTO memory_leak_logs ( + test_time, build_commit, test_name, memory_status, + peak_memory_kb, status_code + ) VALUES (?, ?, ?, ?, ?, ?) + """, ( + datetime.now().isoformat(), + build_commit, + issue["test_name"], + "EXCESSIVE_USAGE", + int(issue["memory_kb"]), + -2 + )) + + conn.commit() + conn.close() + logger.info(f"Stored {len(self.leaks_detected)} leak records and " + f"{len(self.memory_issues)} excessive usage records") + + except sqlite3.Error as e: + logger.error(f"Error storing results: {e}") + + def generate_report(self, output_file: str): + """ + Generate a markdown report of memory monitoring results. + + Args: + output_file: Path to output markdown file + """ + with open(output_file, 'w') as f: + f.write("# Memory Leak Monitoring Report\n\n") + f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") + + if self.leaks_detected: + f.write("## ⚠️ Memory Leaks Detected\n\n") + f.write(f"**Total Leaks:** {len(self.leaks_detected)}\n\n") + + for leak in self.leaks_detected: + f.write(f"### {leak['test_name']}\n\n") + f.write(f"- **Initial Memory:** {leak['initial_memory_kb'] / 1024:.2f} MB\n") + f.write(f"- **Final Memory:** {leak['final_memory_kb'] / 1024:.2f} MB\n") + f.write(f"- **Leaked:** {leak['leaked_memory_mb']:.2f} MB\n\n") + else: + f.write("## ✅ No Memory Leaks Detected\n\n") + + if self.memory_issues: + f.write("## ⚠️ Excessive Memory Usage\n\n") + f.write(f"**Total Issues:** {len(self.memory_issues)}\n\n") + + for issue in self.memory_issues: + f.write(f"### {issue['test_name']}\n\n") + f.write(f"- **Memory Used:** {issue['memory_gb']:.2f} GB\n") + f.write(f"- **Threshold:** {issue['threshold_gb']} GB\n\n") + + logger.info(f"Report written to {output_file}") + + +def main(): + """Main entry point for memory leak monitor.""" + parser = argparse.ArgumentParser( + description="Monitor memory usage and detect leaks in llama.cpp benchmarks" + ) + parser.add_argument( + "--benchmark-output", + help="Path to benchmark output file to analyze" + ) + parser.add_argument( + "--test-log", + help="Path to test log file to analyze" + ) + parser.add_argument( + "--database", + help="Path to SQLite database for storing results" + ) + parser.add_argument( + "--commit", + default="unknown", + help="Git commit SHA for this run" + ) + parser.add_argument( + "--report", + default="memory-report.md", + help="Output path for memory report" + ) + 
parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose logging" + ) + + args = parser.parse_args() + + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + + monitor = MemoryLeakMonitor(db_path=args.database) + + if args.benchmark_output: + memory_records = monitor.parse_benchmark_output(args.benchmark_output) + if len(memory_records) >= 2: + initial = memory_records[0]["value_kb"] + final = memory_records[-1]["value_kb"] + monitor.detect_leaks(initial, final, "benchmark") + + if args.test_log: + status_records = monitor.parse_test_logs(args.test_log) + for record in status_records: + if record.get("is_failure"): + logger.error(f"Memory failure at line {record['line_number']}: {record['line']}") + + if args.database: + monitor.store_results(args.commit) + + monitor.generate_report(args.report) + + has_issues = bool(monitor.leaks_detected or monitor.memory_issues) + sys.exit(1 if has_issues else 0) + + +if __name__ == "__main__": + main() diff --git a/scripts/performance-regression-detector.py b/scripts/performance-regression-detector.py new file mode 100755 index 0000000000000..30efd55214476 --- /dev/null +++ b/scripts/performance-regression-detector.py @@ -0,0 +1,407 @@ +#!/usr/bin/env python3 +""" +Performance Regression Detector for llama.cpp + +This script compares benchmark results between baseline and current runs, +detecting performance regressions above a configurable threshold. + +It integrates with the existing llama-bench SQLite database schema and +provides automated alerts for CI/CD pipelines. +""" + +import argparse +import json +import logging +import os +import sqlite3 +import sys +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +LLAMA_BENCH_DB_FIELDS = [ + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", + "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", + "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", + "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", + "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", +] + +BENCHMARK_KEY_PROPERTIES = [ + "model_type", "n_batch", "n_ubatch", "n_threads", "n_gpu_layers", + "backends", "n_prompt", "n_gen", "flash_attn" +] + +PERFORMANCE_METRICS = { + "avg_ts": { + "name": "Average Tokens/Second", + "unit": "tokens/s", + "direction": "higher_is_better", + "format": "{:.2f}" + }, + "avg_ns": { + "name": "Average Latency", + "unit": "ns", + "direction": "lower_is_better", + "format": "{:.0f}" + }, + "model_size": { + "name": "Model Size", + "unit": "bytes", + "direction": "lower_is_better", + "format": "{:.0f}" + } +} + +logger = logging.getLogger("performance-regression-detector") + + +class RegressionDetector: + """Detects performance regressions by comparing benchmark results.""" + + def __init__(self, baseline_db: str, current_db: str, threshold: float = 5.0): + """ + Initialize the regression detector. 
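+
+        Example:
+            detector = RegressionDetector("baseline.sqlite", "current.sqlite", threshold=5.0)
+            analysis = detector.analyze()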
+ + Args: + baseline_db: Path to baseline SQLite database + current_db: Path to current run SQLite database + threshold: Regression threshold percentage (default: 5.0) + """ + self.baseline_db = baseline_db + self.current_db = current_db + self.threshold = threshold + self.regressions: List[Dict[str, Any]] = [] + self.improvements: List[Dict[str, Any]] = [] + self.stable: List[Dict[str, Any]] = [] + + def load_results(self, db_path: str) -> List[Dict[str, Any]]: + """Load benchmark results from SQLite database.""" + if not os.path.exists(db_path): + logger.warning(f"Database not found: {db_path}") + return [] + + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + try: + cursor.execute("SELECT * FROM test") + results = [dict(row) for row in cursor.fetchall()] + logger.info(f"Loaded {len(results)} results from {db_path}") + return results + except sqlite3.OperationalError as e: + logger.error(f"Error reading database {db_path}: {e}") + return [] + finally: + conn.close() + + def match_benchmark( + self, baseline: Dict[str, Any], current_results: List[Dict[str, Any]] + ) -> Optional[Dict[str, Any]]: + """ + Find matching benchmark in current results based on key properties. + + Args: + baseline: Baseline benchmark result + current_results: List of current benchmark results + + Returns: + Matching benchmark or None if no match found + """ + for current in current_results: + match = True + for key in BENCHMARK_KEY_PROPERTIES: + if key not in baseline or key not in current: + continue + if baseline[key] != current[key]: + match = False + break + if match: + return current + return None + + def calculate_regression( + self, metric_name: str, baseline_value: float, current_value: float + ) -> Tuple[float, bool]: + """ + Calculate regression percentage and determine if it exceeds threshold. + + Args: + metric_name: Name of the metric being compared + baseline_value: Baseline metric value + current_value: Current metric value + + Returns: + Tuple of (change_percentage, is_regression) + """ + if baseline_value == 0: + return 0.0, False + + metric_info = PERFORMANCE_METRICS.get(metric_name, {}) + direction = metric_info.get("direction", "higher_is_better") + + change_pct = ((current_value - baseline_value) / baseline_value) * 100 + + if direction == "higher_is_better": + is_regression = change_pct < -self.threshold + else: + is_regression = change_pct > self.threshold + + return change_pct, is_regression + + def analyze(self) -> Dict[str, Any]: + """ + Analyze benchmark results and detect regressions. 
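+
+        On a successful comparison the result contains "status", "threshold", "regressions",
+        "improvements", "stable" and "summary"; this is the same structure that main()
+        serializes when --json-output is given.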
+ + Returns: + Dictionary containing analysis results + """ + logger.info("Starting regression analysis...") + + baseline_results = self.load_results(self.baseline_db) + current_results = self.load_results(self.current_db) + + if not baseline_results: + logger.warning("No baseline results found - skipping comparison") + return { + "status": "no_baseline", + "message": "No baseline results available for comparison", + "regressions": [], + "improvements": [], + "stable": [], + "summary": { + "total_benchmarks": 0, + "regressions_found": 0, + "improvements_found": 0, + "stable_benchmarks": 0 + } + } + + if not current_results: + logger.error("No current results found") + return { + "status": "error", + "message": "No current results found", + "regressions": [], + "improvements": [], + "stable": [], + "summary": { + "total_benchmarks": len(baseline_results), + "regressions_found": 0, + "improvements_found": 0, + "stable_benchmarks": 0 + } + } + + for baseline in baseline_results: + current = self.match_benchmark(baseline, current_results) + if not current: + logger.debug(f"No matching current result for baseline: {baseline.get('model_type')}") + continue + + benchmark_key = self._generate_benchmark_key(baseline) + has_regression = False + has_improvement = False + changes = {} + + for metric_name in ["avg_ts", "avg_ns"]: + if metric_name not in baseline or metric_name not in current: + continue + + baseline_value = baseline[metric_name] + current_value = current[metric_name] + + if baseline_value is None or current_value is None: + continue + + change_pct, is_regression = self.calculate_regression( + metric_name, baseline_value, current_value + ) + + metric_info = PERFORMANCE_METRICS[metric_name] + changes[metric_name] = { + "baseline": baseline_value, + "current": current_value, + "change_pct": change_pct, + "is_regression": is_regression, + "unit": metric_info["unit"], + "name": metric_info["name"] + } + + if is_regression: + has_regression = True + elif abs(change_pct) > self.threshold: + has_improvement = True + + result = { + "benchmark_key": benchmark_key, + "baseline": baseline, + "current": current, + "changes": changes + } + + if has_regression: + self.regressions.append(result) + elif has_improvement: + self.improvements.append(result) + else: + self.stable.append(result) + + status = "regression" if self.regressions else "pass" + + return { + "status": status, + "threshold": self.threshold, + "regressions": self.regressions, + "improvements": self.improvements, + "stable": self.stable, + "summary": { + "total_benchmarks": len(baseline_results), + "regressions_found": len(self.regressions), + "improvements_found": len(self.improvements), + "stable_benchmarks": len(self.stable) + } + } + + def _generate_benchmark_key(self, benchmark: Dict[str, Any]) -> str: + """Generate a human-readable key for a benchmark.""" + parts = [] + if "model_type" in benchmark: + parts.append(benchmark["model_type"]) + if "backends" in benchmark: + parts.append(f"backend:{benchmark['backends']}") + if "n_gpu_layers" in benchmark and benchmark["n_gpu_layers"]: + parts.append(f"ngl:{benchmark['n_gpu_layers']}") + if "n_prompt" in benchmark: + parts.append(f"p:{benchmark['n_prompt']}") + if "n_gen" in benchmark: + parts.append(f"g:{benchmark['n_gen']}") + return " | ".join(parts) if parts else "unknown" + + def generate_report(self, output_path: str, analysis: Dict[str, Any]): + """Generate a markdown report of the regression analysis.""" + with open(output_path, "w") as f: + f.write("# Performance Regression 
Analysis Report\n\n") + f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") + f.write(f"**Threshold:** {self.threshold}%\n\n") + + summary = analysis["summary"] + f.write("## Summary\n\n") + f.write(f"- **Total Benchmarks Compared:** {summary['total_benchmarks']}\n") + f.write(f"- **Regressions Found:** {summary['regressions_found']}\n") + f.write(f"- **Improvements Found:** {summary['improvements_found']}\n") + f.write(f"- **Stable Benchmarks:** {summary['stable_benchmarks']}\n\n") + + if analysis["status"] == "regression": + f.write("## ⚠️ Performance Regressions Detected\n\n") + for reg in analysis["regressions"]: + self._write_benchmark_section(f, reg, "Regression") + elif analysis["status"] == "no_baseline": + f.write("## ℹ️ No Baseline Available\n\n") + f.write(analysis["message"] + "\n\n") + else: + f.write("## ✅ No Performance Regressions Detected\n\n") + + if analysis["improvements"]: + f.write("## 📈 Performance Improvements\n\n") + for imp in analysis["improvements"]: + self._write_benchmark_section(f, imp, "Improvement") + + if analysis.get("stable"): + f.write("## 📊 Stable Performance\n\n") + f.write(f"**{len(analysis['stable'])} benchmarks** showed stable performance ") + f.write(f"(within ±{self.threshold}% threshold).\n\n") + + logger.info(f"Report written to {output_path}") + + def _write_benchmark_section(self, f, result: Dict[str, Any], section_type: str): + """Write a benchmark comparison section to the report.""" + f.write(f"### {result['benchmark_key']}\n\n") + + for metric_name, change in result["changes"].items(): + if not change.get("is_regression") and section_type == "Regression": + continue + if change.get("is_regression") and section_type == "Improvement": + continue + + baseline_val = change["baseline"] + current_val = change["current"] + change_pct = change["change_pct"] + unit = change["unit"] + name = change["name"] + + icon = "⚠️" if change.get("is_regression") else "✅" + direction = "↓" if change_pct < 0 else "↑" + + f.write(f"{icon} **{name}**:\n") + f.write(f"- Baseline: {baseline_val:.2f} {unit}\n") + f.write(f"- Current: {current_val:.2f} {unit}\n") + f.write(f"- Change: {direction} {abs(change_pct):.2f}%\n\n") + + +def main(): + """Main entry point for the regression detector.""" + parser = argparse.ArgumentParser( + description="Detect performance regressions in llama.cpp benchmarks" + ) + parser.add_argument( + "--baseline", + required=True, + help="Path to baseline SQLite database" + ) + parser.add_argument( + "--current", + required=True, + help="Path to current run SQLite database" + ) + parser.add_argument( + "--threshold", + type=float, + default=5.0, + help="Regression threshold percentage (default: 5.0)" + ) + parser.add_argument( + "--output", + default="regression-report.md", + help="Output path for regression report (default: regression-report.md)" + ) + parser.add_argument( + "--json-output", + help="Optional JSON output path for machine-readable results" + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose logging" + ) + + args = parser.parse_args() + + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + + detector = RegressionDetector(args.baseline, args.current, args.threshold) + analysis = detector.analyze() + + detector.generate_report(args.output, analysis) + + if args.json_output: + with open(args.json_output, "w") as f: + json.dump(analysis, f, indent=2, default=str) + 
logger.info(f"JSON report written to {args.json_output}") + + if analysis["status"] == "regression": + Path("regression-detected.flag").touch() + logger.error(f"Performance regression detected: {len(analysis['regressions'])} benchmarks affected") + sys.exit(1) + else: + logger.info("No performance regressions detected") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 91719577564a9..dd183734c7c32 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -219,3 +219,14 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) target_link_libraries(${TEST_TARGET} PRIVATE llama) + +# Performance regression test suite +if (EXISTS ${CMAKE_SOURCE_DIR}/tools/llama-bench/llama-bench.cpp) + llama_test_cmd( + ${CMAKE_BINARY_DIR}/bin/llama-bench + NAME test-performance-regression-cpu + LABEL "performance" + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + ARGS -p 512 -n 128 -r 3 -o sql + ) +endif()