|
12 | 12 |
|
13 | 13 | env:
|
14 | 14 | EVAL_MODELS: "openai/gpt-4.1,google/gemini-2.0-flash,anthropic/claude-3-5-sonnet-latest"
|
15 |
| - EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract" |
| 15 | + EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract,agent" |
16 | 16 | EVAL_MAX_CONCURRENCY: 25
|
17 | 17 | EVAL_TRIAL_COUNT: 5
|
18 | 18 |
|
|
29 | 29 | run-act: ${{ steps.check-labels.outputs.run-act }}
|
30 | 30 | run-observe: ${{ steps.check-labels.outputs.run-observe }}
|
31 | 31 | run-targeted-extract: ${{ steps.check-labels.outputs.run-targeted-extract }}
|
| 32 | + run-agent: ${{ steps.check-labels.outputs.run-agent }} |
32 | 33 | steps:
|
33 | 34 | - id: check-labels
|
34 | 35 | run: |
|
|
40 | 41 | echo "run-act=true" >> $GITHUB_OUTPUT
|
41 | 42 | echo "run-observe=true" >> $GITHUB_OUTPUT
|
42 | 43 | echo "run-targeted-extract=true" >> $GITHUB_OUTPUT
|
| 44 | + echo "run-agent=true" >> $GITHUB_OUTPUT |
43 | 45 | exit 0
|
44 | 46 | fi
|
45 | 47 |
|
|
49 | 51 | echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
|
50 | 52 | echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
|
51 | 53 | echo "run-targeted-extract=${{ contains(github.event.pull_request.labels.*.name, 'targeted-extract') }}" >> $GITHUB_OUTPUT
|
| 54 | + echo "run-agent=${{ contains(github.event.pull_request.labels.*.name, 'agent') }}" >> $GITHUB_OUTPUT |
52 | 55 |
|
53 | 56 | run-lint:
|
54 | 57 | runs-on: ubuntu-latest
|
@@ -562,3 +565,73 @@ jobs:
|
562 | 565 | echo "Eval summary not found for targeted_extract category. Failing CI."
|
563 | 566 | exit 1
|
564 | 567 | fi
|
| 568 | +
|
# Runs the "agent" eval category. The job waits on `run-targeted-extract-evals`
# and `determine-evals`; every step after the label check is gated on the
# `run-agent` output of `determine-evals`.
run-agent-evals:
  needs: [run-targeted-extract-evals, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 90  # Agent evals can be long-running
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
    # Use agent models for agent evals in CI
    EVAL_AGENT_MODELS: "computer-use-preview-2025-03-11,claude-3-7-sonnet-latest"
    EVAL_TRIAL_COUNT: 2  # Reduce trials for agent evals
    EVAL_MAX_CONCURRENCY: 10  # Lower concurrency for agent evals
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    # Records whether agent evals were requested as the `has_label` step
    # output and logs a message when they were not. This step never fails.
    - name: Check for 'agent' label
      id: label-check
      run: |
        if [ "${{ needs.determine-evals.outputs.run-agent }}" != "true" ]; then
          echo "has_label=false" >> $GITHUB_OUTPUT
          echo "No label for AGENT. Exiting with success."
        else
          echo "has_label=true" >> $GITHUB_OUTPUT
        fi

    - name: Set up Node.js
      if: needs.determine-evals.outputs.run-agent == 'true'
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      if: needs.determine-evals.outputs.run-agent == 'true'
      run: |
        rm -rf node_modules
        npm i -g pnpm
        pnpm install --no-frozen-lockfile

    - name: Build Stagehand
      if: needs.determine-evals.outputs.run-agent == 'true'
      run: pnpm run build

    - name: Run Agent Evals
      if: needs.determine-evals.outputs.run-agent == 'true'
      run: pnpm run evals category agent

    # Reads eval-summary.json, prints a link to the experiment, and fails the
    # job when the agent category score is below 50.
    - name: Log Agent Evals Performance
      if: needs.determine-evals.outputs.run-agent == 'true'
      run: |
        # Verify the summary exists before any jq call reads it, so a missing
        # file produces the explicit message below instead of a jq error.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for agent category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

        agent_score=$(jq '.categories.agent' eval-summary.json)
        echo "Agent category score: $agent_score%"
        # Lower threshold for agent evals since they're complex
        if (( $(echo "$agent_score < 50" | bc -l) )); then
          echo "Agent category score is below 50%. Failing CI."
          exit 1
        fi
0 commit comments