# Terminal-Bench #33
# NOTE: GitHub's file viewer flagged this file as containing hidden or
# bidirectional Unicode text that may be interpreted or compiled differently
# than it appears. To review, open the file in an editor that reveals hidden
# Unicode characters (see GitHub's documentation on bidirectional Unicode
# characters).
# Terminal-Bench workflow: runs the terminal-bench suite against a chosen build.
# It is triggered manually only (workflow_dispatch).
# Template taken from
# https://github.com/actions/starter-workflows/blob/main/automation/manual.yml
# for reference.
name: Terminal-Bench

# Controls when the action will run. The workflow runs when manually triggered
# using the UI.
on:
  workflow_dispatch:
    inputs:
      git_commit_hash:
        description: 'Input git commit hash to run TB on (must exist on S3)'
        required: true
        default: 'latest'
        type: string

jobs:
  run-benchmark:
    # avoids disk storage issues
    runs-on: ubuntu-latest
    # makes these env vars available in main.py
    env:
      CHAT_DOWNLOAD_ROLE_ARN: ${{ secrets.CHAT_DOWNLOAD_ROLE_ARN }}
      CHAT_BUILD_BUCKET_NAME: ${{ secrets.CHAT_BUILD_BUCKET_NAME }}
      # The dispatch input is passed through the environment rather than being
      # interpolated into a script.
      GIT_HASH: ${{ github.event.inputs.git_commit_hash }}
    permissions:
      # id-token: write lets the job request an OIDC token, which the
      # "Configure AWS credentials" step needs in order to assume its role.
      id-token: write
      contents: read
    steps:
      # clear unnecessary storage to ensure docker containers have space
      - name: Cleanup and free disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/swift
          sudo apt-get clean
          df -h

      - name: Checkout repository
        uses: actions/checkout@v4

      # v5 runs on the Node 20 runtime like the other actions in this job
      # (v4 used the deprecated Node 16 runtime); inputs are unchanged.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # quoted so the version stays a string
          python-version: '3.13'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install terminal-bench

      # OIDC enabled for GitHub for ArjunPersonal
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_TB_ROLE }}
          aws-region: us-east-1

      - name: Run terminal benchmark
        run: |
          cd terminal-bench-test
          tb run \
            --agent-import-path main:AmazonQCLIAgent \
            --dataset-name terminal-bench-core \
            --dataset-version head \
            --n-tasks=20 \
            --cleanup

      # uploads results if run fails as well to allow for easy log inspection
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: terminal-bench-test/runs/