# Terminal-Bench #45
# NOTE: the web view this file was copied from reported that the file contains
# hidden or bidirectional Unicode text that may be interpreted or compiled
# differently than what appears below. To review, open the file in an editor
# that reveals hidden Unicode characters.
# Terminal-bench workflow; it only runs when triggered manually.
# Template taken from https://github.com/actions/starter-workflows/blob/main/automation/manual.yml for reference
name: Terminal-Bench

# Trigger: manual dispatch from the Actions UI (workflow_dispatch) only.
on:
  workflow_dispatch:
    inputs:
      # Single required string input; the UI pre-fills it with 'all'.
      name:
        description: 'Run terminal-bench workflow to test Q CLI in real terminal environments.'
        default: 'all'
        required: true
        type: string
jobs:
  # Single job: frees disk space, installs terminal-bench, assumes an AWS role
  # and runs the benchmark, then uploads the run directory as an artifact.
  run-benchmark:
    # avoids disk storage issues
    runs-on: ubuntu-latest
    # Job-level env, so every step sees these values; per the original note
    # they are read by main.py (the module loaded by `tb run` below).
    env:
      CHAT_DOWNLOAD_ROLE_ARN: ${{ secrets.CHAT_DOWNLOAD_ROLE_ARN }}
      CHAT_BUILD_BUCKET_NAME: ${{ secrets.CHAT_BUILD_BUCKET_NAME }}
    # id-token: write allows the job to request an OIDC token (used by the
    # AWS credentials step); contents: read is sufficient for checkout.
    permissions:
      id-token: write
      contents: read
    steps:
      # clear unnecessary storage to ensure docker containers have space
      - name: Cleanup and free disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/swift
          sudo apt-get clean
          df -h

      - name: Checkout repository
        uses: actions/checkout@v4

      # Captures git hash of branch to query specific S3 bucket
      - name: Set git hash
        run: |
          if [ -n "$GITHUB_SHA" ]; then
            git_hash=$(git rev-parse "$GITHUB_SHA")
          else
            git_hash="latest"
          fi
          # appends to github_env file so later steps see GIT_HASH;
          # the path is quoted to avoid word splitting
          echo "GIT_HASH=$git_hash" >> "$GITHUB_ENV"
          echo "Git hash set to: $git_hash"

      # v5 runs on the Node 20 action runtime (v4 used the deprecated Node 16)
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # quoted so the version stays a string
          python-version: '3.13'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install terminal-bench

      # OIDC enabled for github for ArjunPersonal
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_TB_ROLE }}
          aws-region: us-east-1

      # Runs from terminal-bench-test so `main:AmazonQCLIAgent` resolves there
      # and results land in terminal-bench-test/runs/ (uploaded below).
      - name: Run terminal benchmark
        working-directory: terminal-bench-test
        run: |
          tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head

      # uploads results if run fails as well to allow for easy log inspection
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: terminal-bench-test/runs/