Add new model to benchmark #296
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Manually triggered workflow: benchmarks one model, then either
#   * on success: opens a pull request with the benchmark results, or
#   * on failure: records the model via `manage-failed-models` and pushes the
#     tracking files straight to `main`.
#
# Security note: the `model` input is free-form text. It is never interpolated
# into shell or JavaScript source with `${{ }}`; every script receives it
# through the BENCHMARK_MODEL environment variable instead, so a crafted value
# cannot inject commands.
#
# NOTE(review): this job pushes to `main`, opens a PR and comments on it using
# GITHUB_TOKEN; confirm the repository's default token permissions allow that
# (or add an explicit `permissions:` block).
name: Add new model to benchmark

on:
  workflow_dispatch:
    inputs:
      model:
        description: "Model to benchmark"
        required: true
        default: "openai/gpt-4o"

jobs:
  benchmark:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '18'
          cache: 'npm'
          cache-dependency-path: src/package-lock.json

      - name: Install dependencies
        working-directory: src
        run: npm ci

      # `id` is required: the failure-handling steps below key off this
      # step's outcome.
      - name: Run benchmark
        id: run-benchmark
        working-directory: src
        run: npm run benchmark -- --model="$BENCHMARK_MODEL" --debug
        env:
          BENCHMARK_MODEL: ${{ github.event.inputs.model }}
          TINYBIRD_API_HOST: ${{ vars.TINYBIRD_API_HOST }}
          TINYBIRD_WORKSPACE_TOKEN: ${{ secrets.WORKSPACE_TOKEN }}
          WORKSPACE_TOKEN: ${{ secrets.WORKSPACE_TOKEN }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}

      # Runs only when the benchmark step itself failed (not when an earlier
      # setup step failed).
      - name: Report benchmark failure
        if: failure() && steps.run-benchmark.outcome == 'failure'
        working-directory: src
        env:
          BENCHMARK_MODEL: ${{ github.event.inputs.model }}
        run: |
          # Split "provider/model" on the FIRST slash only, so ids whose model
          # part itself contains a slash are not truncated.
          PROVIDER="${BENCHMARK_MODEL%%/*}"
          MODEL_NAME="${BENCHMARK_MODEL#*/}"
          echo "Benchmark failed for $BENCHMARK_MODEL"
          echo "Adding to failed models list..."
          # Add to failed models
          npm run manage-failed-models add "$PROVIDER" "$MODEL_NAME" "$BENCHMARK_MODEL" "Benchmark workflow failed"
          echo "Model $BENCHMARK_MODEL has been added to the failed list"

      - name: Commit failure changes
        if: failure() && steps.run-benchmark.outcome == 'failure'
        env:
          BENCHMARK_MODEL: ${{ github.event.inputs.model }}
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
          # Stage only the tracking files that actually changed. Using
          # `git status --porcelain` per file also catches files that were
          # just created (untracked), which a plain `git diff` would miss,
          # and skips files that do not exist so `git add` cannot fail.
          for f in src/untested-models.json src/failed-models.json; do
            if [ -n "$(git status --porcelain -- "$f")" ]; then
              git add -A -- "$f"
            fi
          done
          # Check the staged index, not the working tree, so unrelated
          # modifications never trigger an empty (failing) commit.
          if git diff --cached --quiet; then
            echo "No changes to commit"
          else
            git commit -m "Add failed model $BENCHMARK_MODEL to tracking lists
          - Model: $BENCHMARK_MODEL
          - Reason: Benchmark workflow failed
          - Run ID: ${{ github.run_id }}
          This prevents the model from being automatically benchmarked again."
            git push origin main
            echo "Failure changes committed and pushed to main branch"
          fi

      # Success path from here on (default `if: success()`).
      - name: Create normalized branch name
        id: branch-name
        env:
          BENCHMARK_MODEL: ${{ github.event.inputs.model }}
        run: |
          # Replace '/', whitespace, '_', '.' and ':' with '-'.
          # [[:space:]] is used because `\s` inside a bracket expression is a
          # literal backslash and the letter "s", which would mangle every
          # model id containing an "s".
          NORMALIZED_MODEL=$(printf '%s\n' "$BENCHMARK_MODEL" | sed 's|[/[:space:]_.:]|-|g')
          echo "branch_name=benchmark/$NORMALIZED_MODEL-${{ github.run_id }}" >> "$GITHUB_OUTPUT"

      # `with:` values are passed to the action as data, not executed by a
      # shell, so expression interpolation is fine here.
      - name: Create Pull Request
        id: create-pr
        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "feat: add benchmark results for ${{ github.event.inputs.model }}"
          title: "Add benchmark results for ${{ github.event.inputs.model }}"
          body: |
            This PR adds benchmark results for the **${{ github.event.inputs.model }}** model.
            The following files have been updated:
            - `src/benchmark/results.json` - Raw benchmark results
            - `src/benchmark/validation-results.json` - Validation results against human baseline
            This PR was automatically generated by the benchmark workflow.
            **Note:** If you don't want to merge this PR, close it and the model will be added to the untested list to prevent re-processing.
            @alrocar
          branch: ${{ steps.branch-name.outputs.branch_name }}
          base: main
          delete-branch: true

      # Skipped when no PR was created (the output is empty in that case).
      - name: Monitor PR status
        if: steps.create-pr.outputs.pull-request-number
        uses: actions/github-script@v7
        env:
          BENCHMARK_MODEL: ${{ github.event.inputs.model }}
          PR_NUMBER: ${{ steps.create-pr.outputs.pull-request-number }}
        with:
          script: |
            // Values arrive through the environment rather than being pasted
            // into the script source, so they are always treated as data.
            const prNumber = Number(process.env.PR_NUMBER);
            const model = process.env.BENCHMARK_MODEL;
            // Wait a bit for the PR to be created
            await new Promise(resolve => setTimeout(resolve, 5000));
            try {
              const pr = await github.rest.pulls.get({
                owner: context.repo.owner,
                repo: context.repo.repo,
                pull_number: prNumber
              });
              // Closed but never merged => the PR was rejected.
              if (pr.data.state === 'closed' && !pr.data.merged_at) {
                console.log(`PR #${prNumber} was closed without merging for model ${model}`);
                console.log('This model will be added to the untested list to prevent re-processing');
                // NOTE(review): nothing in this workflow actually updates the
                // untested list; the comment below only announces it. Confirm
                // that another process performs the update.
                // Create a comment on the closed PR
                await github.rest.issues.createComment({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  issue_number: prNumber,
                  body: `This model has been added to the untested list to prevent automatic re-processing.`
                });
              }
            } catch (error) {
              // Best-effort check: a failure here must not fail the job.
              console.log('Could not check PR status:', error.message);
            }