Automated Performance Regression Testing System (AT-105) #2

Workflow file for this run

.github/workflows/performance-regression.yml at df5e290

	name: Performance Regression Testing

	# Triggers: manual dispatch, pull requests, and pushes to master.
	# The pull_request and push triggers are path-filtered so the workflow only
	# runs when this workflow file, CMake build files, C/C++/CUDA sources or
	# headers, llama-bench, or the regression scripts change.
	# The two path lists are intentionally identical; keep them in sync.
	on:
	  workflow_dispatch: # allows manual triggering
	  pull_request:
	    types: [opened, synchronize, reopened]
	    paths:
	      - '.github/workflows/performance-regression.yml'
	      - '**/CMakeLists.txt'
	      # '**/*.ext' matches files with that extension at any depth; the
	      # previous '*/.ext' form only matched a file literally named '.ext'.
	      - '**/*.cmake'
	      - '**/*.h'
	      - '**/*.hpp'
	      - '**/*.c'
	      - '**/*.cpp'
	      - '**/*.cu'
	      - '**/*.cuh'
	      - 'tools/llama-bench/**'
	      - 'scripts/performance-regression-detector.py'
	      - 'scripts/compare-llama-bench.py'
	  push:
	    branches:
	      - master
	    paths:
	      - '.github/workflows/performance-regression.yml'
	      - '**/CMakeLists.txt'
	      - '**/*.cmake'
	      - '**/*.h'
	      - '**/*.hpp'
	      - '**/*.c'
	      - '**/*.cpp'
	      - '**/*.cu'
	      - '**/*.cuh'
	      - 'tools/llama-bench/**'
	      - 'scripts/performance-regression-detector.py'
	      - 'scripts/compare-llama-bench.py'

	concurrency:
	  # Runs that have a head_ref (pull requests) share one group per ref, so a
	  # newer run cancels the older one. Runs without a head_ref fall back to the
	  # unique run_id and therefore never cancel each other.
	  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
	  cancel-in-progress: true

	# Workflow-wide settings, exported as environment variables to every step.
	env:
	  # Passed to the detector script as --threshold. Quoted so the value stays
	  # the literal text '5.0' instead of being parsed as a YAML float.
	  # NOTE(review): presumably a percentage — confirm against the script.
	  REGRESSION_THRESHOLD: '5.0'
	  # SQLite file holding the reference (baseline) benchmark results.
	  BASELINE_DB: performance-baseline.sqlite
	  # SQLite file holding the benchmark results of the commit under test.
	  RESULTS_DB: performance-results.sqlite

	jobs:
	  # Each job follows the same sequence: build llama-bench, fetch the test
	  # model, obtain a baseline result database (restored from cache, or
	  # produced by benchmarking the base commit), benchmark the current commit,
	  # run the regression detector, publish the report, and on pushes to master
	  # store the current results as the next baseline.
	  performance-cpu:
	    runs-on: ubuntu-latest
	    # Least privilege: read the repository, write PR comments.
	    permissions:
	      contents: read
	      pull-requests: write

	    steps:
	      - name: Clone
	        uses: actions/checkout@v4
	        with:
	          fetch-depth: 0 # Full history for baseline comparison

	      - name: Setup Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: '3.x'

	      - name: Install Python dependencies
	        run: |
	          pip install GitPython tabulate matplotlib

	      - name: ccache
	        # NOTE(review): the action reference was obfuscated in the source
	        # dump; confirm the action name and tag.
	        uses: ggml-org/ccache-action@v1.2.16
	        with:
	          key: performance-cpu
	          evict-old-files: 1d

	      - name: Build llama-bench
	        run: |
	          cmake -B build \
	            -DCMAKE_BUILD_TYPE=Release \
	            -DLLAMA_FATAL_WARNINGS=ON
	          # llama-cli is built as well because the model download step runs it.
	          cmake --build build --target llama-bench llama-cli -j $(nproc)

	      - name: Download test model
	        # NOTE(review): confirm that llama-cli accepts --model-download-only
	        # and that the referenced repo actually hosts this GGUF file.
	        run: |
	          mkdir -p models
	          # Download TinyLlama test model if not present
	          if [ ! -f models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf ]; then
	            ./build/bin/llama-cli --hf-repo TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
	              --hf-file tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
	              --model-download-only
	            mv ~/.cache/llama.cpp/*/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf models/
	          fi

	      - name: Restore baseline database
	        id: restore-baseline
	        uses: actions/cache/restore@v4
	        with:
	          path: ${{ env.BASELINE_DB }}
	          key: perf-baseline-cpu-${{ github.base_ref || 'master' }}
	          restore-keys: |
	            perf-baseline-cpu-

	      - name: Run baseline benchmark (if no baseline exists)
	        # Check for the file itself: saved keys carry a commit SHA suffix, so
	        # a restore is never an exact key match and cache-hit is never 'true'.
	        if: hashFiles(env.BASELINE_DB) == ''
	        env:
	          BASE_REF: ${{ github.event.pull_request.base.sha || github.event.before || 'master' }}
	        run: |
	          set -o pipefail
	          current_sha=$(git rev-parse HEAD)
	          # Best effort: a missing baseline must not fail the job here.
	          if git checkout "$BASE_REF" \
	            && cmake --build build --target llama-bench -j $(nproc); then
	            ./build/bin/llama-bench \
	              -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
	              -p 512 -n 128 -r 3 \
	              -o sql | sqlite3 "$BASELINE_DB" \
	              || echo "::warning::Baseline benchmark failed; continuing without a fresh baseline"
	          else
	            echo "::warning::Could not check out or build baseline $BASE_REF; skipping baseline benchmark"
	          fi
	          # Return to the commit under test and rebuild, otherwise the next
	          # step would benchmark the baseline binary left in build/.
	          git checkout "$current_sha"
	          cmake --build build --target llama-bench -j $(nproc)

	      - name: Run current benchmark
	        run: |
	          # pipefail: a crashing llama-bench must fail the step instead of
	          # silently producing an empty database.
	          set -o pipefail
	          ./build/bin/llama-bench \
	            -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
	            -p 512 -n 128 -r 3 \
	            -o sql | sqlite3 "$RESULTS_DB"

	      - name: Detect performance regressions
	        id: detect-regression
	        run: |
	          python scripts/performance-regression-detector.py \
	            --baseline "$BASELINE_DB" \
	            --current "$RESULTS_DB" \
	            --threshold "$REGRESSION_THRESHOLD" \
	            --output regression-report.md

	          # Set output for subsequent steps
	          if [ -f regression-detected.flag ]; then
	            echo "regression=true" >> "$GITHUB_OUTPUT"
	          else
	            echo "regression=false" >> "$GITHUB_OUTPUT"
	          fi

	      - name: Upload regression report
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: performance-report-cpu
	          path: |
	            regression-report.md
	            ${{ env.RESULTS_DB }}
	            ${{ env.BASELINE_DB }}

	      - name: Comment on PR with results
	        # Skipped for fork PRs, whose token cannot write comments.
	        if: >-
	          always()
	          && github.event_name == 'pull_request'
	          && github.event.pull_request.head.repo.full_name == github.repository
	        uses: actions/github-script@v7
	        with:
	          script: |
	            const fs = require('fs');
	            let report = 'Performance Regression Test Results (CPU)\n\n';

	            if (fs.existsSync('regression-report.md')) {
	              report += fs.readFileSync('regression-report.md', 'utf8');
	            } else {
	              report += 'No regression report generated.';
	            }

	            // Awaited so an API failure surfaces as a step failure.
	            await github.rest.issues.createComment({
	              issue_number: context.issue.number,
	              owner: context.repo.owner,
	              repo: context.repo.repo,
	              body: report
	            });

	      - name: Fail if regression detected
	        if: steps.detect-regression.outputs.regression == 'true'
	        run: |
	          echo "⚠️ Performance regression detected! Check the report for details."
	          exit 1

	      - name: Stage results as the new baseline
	        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
	        # The cache must be saved from the same path it is later restored to.
	        run: cp "$RESULTS_DB" "$BASELINE_DB"

	      - name: Save baseline database
	        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
	        uses: actions/cache/save@v4
	        with:
	          path: ${{ env.BASELINE_DB }}
	          key: perf-baseline-cpu-master-${{ github.sha }}

	  performance-cuda:
	    runs-on: gpu-runner
	    if: false # Disabled by default - enable when GPU runners are available
	    # Least privilege: read the repository, write PR comments.
	    permissions:
	      contents: read
	      pull-requests: write

	    steps:
	      - name: Clone
	        uses: actions/checkout@v4
	        with:
	          fetch-depth: 0

	      - name: Setup Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: '3.x'

	      - name: Install Python dependencies
	        run: |
	          pip install GitPython tabulate matplotlib

	      - name: ccache
	        # NOTE(review): the action reference was obfuscated in the source
	        # dump; confirm the action name and tag.
	        uses: ggml-org/ccache-action@v1.2.16
	        with:
	          key: performance-cuda
	          evict-old-files: 1d

	      - name: Build llama-bench with CUDA
	        run: |
	          cmake -B build \
	            -DCMAKE_BUILD_TYPE=Release \
	            -DGGML_CUDA=ON \
	            -DLLAMA_FATAL_WARNINGS=ON
	          # llama-cli is built as well because the model download step runs it.
	          cmake --build build --target llama-bench llama-cli -j $(nproc)

	      - name: Download test model
	        # NOTE(review): confirm that llama-cli accepts --model-download-only
	        # and that the referenced repo actually hosts this GGUF file.
	        run: |
	          mkdir -p models
	          if [ ! -f models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf ]; then
	            ./build/bin/llama-cli --hf-repo TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
	              --hf-file tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
	              --model-download-only
	            mv ~/.cache/llama.cpp/*/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf models/
	          fi

	      - name: Restore baseline database
	        id: restore-baseline-cuda
	        uses: actions/cache/restore@v4
	        with:
	          path: ${{ env.BASELINE_DB }}
	          key: perf-baseline-cuda-${{ github.base_ref || 'master' }}
	          restore-keys: |
	            perf-baseline-cuda-

	      - name: Run baseline benchmark (if no baseline exists)
	        # Check for the file itself: saved keys carry a commit SHA suffix, so
	        # a restore is never an exact key match and cache-hit is never 'true'.
	        if: hashFiles(env.BASELINE_DB) == ''
	        env:
	          BASE_REF: ${{ github.event.pull_request.base.sha || github.event.before || 'master' }}
	        run: |
	          set -o pipefail
	          current_sha=$(git rev-parse HEAD)
	          # Best effort: a missing baseline must not fail the job here.
	          if git checkout "$BASE_REF" \
	            && cmake --build build --target llama-bench -j $(nproc); then
	            ./build/bin/llama-bench \
	              -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
	              -ngl 99 -p 512 -n 128 -r 3 \
	              -o sql | sqlite3 "$BASELINE_DB" \
	              || echo "::warning::Baseline benchmark failed; continuing without a fresh baseline"
	          else
	            echo "::warning::Could not check out or build baseline $BASE_REF; skipping baseline benchmark"
	          fi
	          # Return to the commit under test and rebuild, otherwise the next
	          # step would benchmark the baseline binary left in build/.
	          git checkout "$current_sha"
	          cmake --build build --target llama-bench -j $(nproc)

	      - name: Run current benchmark
	        run: |
	          # pipefail: a crashing llama-bench must fail the step instead of
	          # silently producing an empty database.
	          set -o pipefail
	          ./build/bin/llama-bench \
	            -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
	            -ngl 99 -p 512 -n 128 -r 3 \
	            -o sql | sqlite3 "$RESULTS_DB"

	      - name: Detect performance regressions
	        id: detect-regression-cuda
	        run: |
	          python scripts/performance-regression-detector.py \
	            --baseline "$BASELINE_DB" \
	            --current "$RESULTS_DB" \
	            --threshold "$REGRESSION_THRESHOLD" \
	            --output regression-report-cuda.md

	          if [ -f regression-detected.flag ]; then
	            echo "regression=true" >> "$GITHUB_OUTPUT"
	          else
	            echo "regression=false" >> "$GITHUB_OUTPUT"
	          fi

	      - name: Upload regression report
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: performance-report-cuda
	          path: |
	            regression-report-cuda.md
	            ${{ env.RESULTS_DB }}
	            ${{ env.BASELINE_DB }}

	      - name: Comment on PR with results
	        # Skipped for fork PRs, whose token cannot write comments.
	        if: >-
	          always()
	          && github.event_name == 'pull_request'
	          && github.event.pull_request.head.repo.full_name == github.repository
	        uses: actions/github-script@v7
	        with:
	          script: |
	            const fs = require('fs');
	            let report = 'Performance Regression Test Results (CUDA)\n\n';

	            if (fs.existsSync('regression-report-cuda.md')) {
	              report += fs.readFileSync('regression-report-cuda.md', 'utf8');
	            } else {
	              report += 'No regression report generated.';
	            }

	            // Awaited so an API failure surfaces as a step failure.
	            await github.rest.issues.createComment({
	              issue_number: context.issue.number,
	              owner: context.repo.owner,
	              repo: context.repo.repo,
	              body: report
	            });

	      - name: Fail if regression detected
	        if: steps.detect-regression-cuda.outputs.regression == 'true'
	        run: |
	          echo "⚠️ Performance regression detected! Check the report for details."
	          exit 1

	      - name: Stage results as the new baseline
	        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
	        # The cache must be saved from the same path it is later restored to.
	        run: cp "$RESULTS_DB" "$BASELINE_DB"

	      - name: Save baseline database
	        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
	        uses: actions/cache/save@v4
	        with:
	          path: ${{ env.BASELINE_DB }}
	          key: perf-baseline-cuda-master-${{ github.sha }}

	  performance-metal:
	    runs-on: macos-14 # macOS with Apple Silicon for Metal testing
	    # Least privilege: read the repository, write PR comments.
	    permissions:
	      contents: read
	      pull-requests: write

	    steps:
	      - name: Clone
	        uses: actions/checkout@v4
	        with:
	          fetch-depth: 0

	      - name: Setup Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: '3.x'

	      - name: Install Python dependencies
	        run: |
	          pip install GitPython tabulate matplotlib

	      - name: ccache
	        # NOTE(review): the action reference was obfuscated in the source
	        # dump; confirm the action name and tag.
	        uses: ggml-org/ccache-action@v1.2.16
	        with:
	          key: performance-metal
	          evict-old-files: 1d

	      - name: Build llama-bench with Metal
	        run: |
	          cmake -B build \
	            -DCMAKE_BUILD_TYPE=Release \
	            -DGGML_METAL=ON \
	            -DLLAMA_FATAL_WARNINGS=ON
	          # llama-cli is built as well because the model download step runs it.
	          cmake --build build --target llama-bench llama-cli -j $(sysctl -n hw.logicalcpu)

	      - name: Download test model
	        # NOTE(review): confirm that llama-cli accepts --model-download-only
	        # and that the referenced repo actually hosts this GGUF file.
	        run: |
	          mkdir -p models
	          if [ ! -f models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf ]; then
	            ./build/bin/llama-cli --hf-repo TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
	              --hf-file tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
	              --model-download-only
	            # Try the ~/.cache location first, then ~/Library/Caches.
	            mv ~/.cache/llama.cpp/*/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf models/ || \
	              mv ~/Library/Caches/llama.cpp/*/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf models/
	          fi

	      - name: Restore baseline database
	        id: restore-baseline-metal
	        uses: actions/cache/restore@v4
	        with:
	          path: ${{ env.BASELINE_DB }}
	          key: perf-baseline-metal-${{ github.base_ref || 'master' }}
	          restore-keys: |
	            perf-baseline-metal-

	      - name: Run baseline benchmark (if no baseline exists)
	        # Check for the file itself: saved keys carry a commit SHA suffix, so
	        # a restore is never an exact key match and cache-hit is never 'true'.
	        if: hashFiles(env.BASELINE_DB) == ''
	        env:
	          BASE_REF: ${{ github.event.pull_request.base.sha || github.event.before || 'master' }}
	        run: |
	          set -o pipefail
	          current_sha=$(git rev-parse HEAD)
	          # Best effort: a missing baseline must not fail the job here.
	          if git checkout "$BASE_REF" \
	            && cmake --build build --target llama-bench -j $(sysctl -n hw.logicalcpu); then
	            ./build/bin/llama-bench \
	              -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
	              -ngl 99 -p 512 -n 128 -r 3 \
	              -o sql | sqlite3 "$BASELINE_DB" \
	              || echo "::warning::Baseline benchmark failed; continuing without a fresh baseline"
	          else
	            echo "::warning::Could not check out or build baseline $BASE_REF; skipping baseline benchmark"
	          fi
	          # Return to the commit under test and rebuild, otherwise the next
	          # step would benchmark the baseline binary left in build/.
	          git checkout "$current_sha"
	          cmake --build build --target llama-bench -j $(sysctl -n hw.logicalcpu)

	      - name: Run current benchmark
	        run: |
	          # pipefail: a crashing llama-bench must fail the step instead of
	          # silently producing an empty database.
	          set -o pipefail
	          ./build/bin/llama-bench \
	            -m models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
	            -ngl 99 -p 512 -n 128 -r 3 \
	            -o sql | sqlite3 "$RESULTS_DB"

	      - name: Detect performance regressions
	        id: detect-regression-metal
	        run: |
	          python scripts/performance-regression-detector.py \
	            --baseline "$BASELINE_DB" \
	            --current "$RESULTS_DB" \
	            --threshold "$REGRESSION_THRESHOLD" \
	            --output regression-report-metal.md

	          if [ -f regression-detected.flag ]; then
	            echo "regression=true" >> "$GITHUB_OUTPUT"
	          else
	            echo "regression=false" >> "$GITHUB_OUTPUT"
	          fi

	      - name: Upload regression report
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: performance-report-metal
	          path: |
	            regression-report-metal.md
	            ${{ env.RESULTS_DB }}
	            ${{ env.BASELINE_DB }}

	      - name: Comment on PR with results
	        # Skipped for fork PRs, whose token cannot write comments.
	        if: >-
	          always()
	          && github.event_name == 'pull_request'
	          && github.event.pull_request.head.repo.full_name == github.repository
	        uses: actions/github-script@v7
	        with:
	          script: |
	            const fs = require('fs');
	            let report = 'Performance Regression Test Results (Metal)\n\n';

	            if (fs.existsSync('regression-report-metal.md')) {
	              report += fs.readFileSync('regression-report-metal.md', 'utf8');
	            } else {
	              report += 'No regression report generated.';
	            }

	            // Awaited so an API failure surfaces as a step failure.
	            await github.rest.issues.createComment({
	              issue_number: context.issue.number,
	              owner: context.repo.owner,
	              repo: context.repo.repo,
	              body: report
	            });

	      - name: Fail if regression detected
	        if: steps.detect-regression-metal.outputs.regression == 'true'
	        run: |
	          echo "⚠️ Performance regression detected! Check the report for details."
	          exit 1

	      - name: Stage results as the new baseline
	        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
	        # The cache must be saved from the same path it is later restored to.
	        run: cp "$RESULTS_DB" "$BASELINE_DB"

	      - name: Save baseline database
	        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
	        uses: actions/cache/save@v4
	        with:
	          path: ${{ env.BASELINE_DB }}
	          key: perf-baseline-metal-master-${{ github.sha }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Automated Performance Regression Testing System (AT-105) #2

Workflow file

Automated Performance Regression Testing System (AT-105) #2

Uh oh!

Jobs

Run details

Workflow file for this run