# Workflow: "Add new model to benchmark" (issue/PR #296)
name: Add new model to benchmark

on:
  workflow_dispatch:
    inputs:
      model:
        description: "Model to benchmark"
        required: true
        default: "openai/gpt-4o"

# GITHUB_TOKEN needs explicit write scopes: this workflow pushes commits to
# main (failure tracking) and opens pull requests (benchmark results).
permissions:
  contents: write
  pull-requests: write

jobs:
  benchmark:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '18'
          cache: 'npm'
          cache-dependency-path: src/package-lock.json

      - name: Install dependencies
        working-directory: src
        run: npm ci

      - name: Run benchmark
        id: run-benchmark
        working-directory: src
        # The model id is passed through env rather than interpolated with
        # ${{ }} inside the script, so a crafted input cannot inject shell.
        run: npm run benchmark -- --model="$MODEL" --debug
        env:
          MODEL: ${{ github.event.inputs.model }}
          TINYBIRD_API_HOST: ${{ vars.TINYBIRD_API_HOST }}
          TINYBIRD_WORKSPACE_TOKEN: ${{ secrets.WORKSPACE_TOKEN }}
          WORKSPACE_TOKEN: ${{ secrets.WORKSPACE_TOKEN }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}

      # Runs only when the benchmark step itself failed (not on other
      # step failures): records the model so it is not retried automatically.
      - name: Report benchmark failure
        if: failure() && steps.run-benchmark.outcome == 'failure'
        working-directory: src
        env:
          MODEL: ${{ github.event.inputs.model }}
        run: |
          PROVIDER=$(echo "$MODEL" | cut -d'/' -f1)
          MODEL_NAME=$(echo "$MODEL" | cut -d'/' -f2)
          echo "Benchmark failed for $MODEL"
          echo "Adding to failed models list..."
          # Add to failed models
          npm run manage-failed-models add "$PROVIDER" "$MODEL_NAME" "$MODEL" "Benchmark workflow failed"
          echo "Model $MODEL has been added to the failed list"

      - name: Commit failure changes
        if: failure() && steps.run-benchmark.outcome == 'failure'
        env:
          MODEL: ${{ github.event.inputs.model }}
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
          # `git status --porcelain` also reports untracked and staged files,
          # which `git diff --quiet` would miss (e.g. a freshly created
          # failed-models.json).
          if [ -z "$(git status --porcelain)" ]; then
            echo "No changes to commit"
          else
            git add src/untested-models.json src/failed-models.json
            git commit -m "Add failed model $MODEL to tracking lists

          - Model: $MODEL
          - Reason: Benchmark workflow failed
          - Run ID: ${{ github.run_id }}

          This prevents the model from being automatically benchmarked again."
            git push origin main
            echo "Failure changes committed and pushed to main branch"
          fi

      - name: Create normalized branch name
        id: branch-name
        env:
          MODEL: ${{ github.event.inputs.model }}
        run: |
          # Replace path separators, whitespace, and punctuation with '-' so
          # the model id is a valid git branch-name component.
          # POSIX [[:space:]] replaces GNU-only \s; '|' delimiter avoids
          # escaping the '/' inside the bracket expression.
          NORMALIZED_MODEL=$(echo "$MODEL" | sed 's|[/[:space:]_.:]|-|g')
          echo "branch_name=benchmark/$NORMALIZED_MODEL-${{ github.run_id }}" >> "$GITHUB_OUTPUT"

      - name: Create Pull Request
        id: create-pr
        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "feat: add benchmark results for ${{ github.event.inputs.model }}"
          title: "Add benchmark results for ${{ github.event.inputs.model }}"
          body: |
            This PR adds benchmark results for the **${{ github.event.inputs.model }}** model.

            The following files have been updated:
            - `src/benchmark/results.json` - Raw benchmark results
            - `src/benchmark/validation-results.json` - Validation results against human baseline

            This PR was automatically generated by the benchmark workflow.

            **Note:** If you don't want to merge this PR, close it and the model will be added to the untested list to prevent re-processing.

            @alrocar
          branch: ${{ steps.branch-name.outputs.branch_name }}
          base: main
          delete-branch: true

      # Best-effort check: if the PR is already closed-unmerged shortly after
      # creation, leave an explanatory comment. Errors are logged, not fatal.
      - name: Monitor PR status
        if: steps.create-pr.outputs.pull-request-number
        uses: actions/github-script@v7
        env:
          MODEL: ${{ github.event.inputs.model }}
        with:
          script: |
            const prNumber = ${{ steps.create-pr.outputs.pull-request-number }};
            // Read via env to avoid injecting the raw input into JS source.
            const model = process.env.MODEL;
            // Wait a bit for the PR to be created
            await new Promise(resolve => setTimeout(resolve, 5000));
            try {
              const pr = await github.rest.pulls.get({
                owner: context.repo.owner,
                repo: context.repo.repo,
                pull_number: prNumber
              });
              if (pr.data.state === 'closed' && !pr.data.merged_at) {
                console.log(`PR #${prNumber} was closed without merging for model ${model}`);
                console.log('This model will be added to the untested list to prevent re-processing');
                // Create a comment on the closed PR
                await github.rest.issues.createComment({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  issue_number: prNumber,
                  body: `This model has been added to the untested list to prevent automatic re-processing.`
                });
              }
            } catch (error) {
              console.log('Could not check PR status:', error.message);
            }