Skip to content

Commit 79a124f

Browse files
authored
🤖 Add workflow_dispatch for Terminal-Bench (#290)
_Generated with `cmux`_

Adds a manually triggerable GitHub Actions workflow for running `make benchmark-terminal`.

**Features:**

- Workflow can be triggered from the GitHub Actions UI with custom parameters
- 3-hour timeout to accommodate long-running benchmarks
- Installs `uv` for `uvx terminal-bench` execution
- Configurable inputs:
  - Dataset (default: `terminal-bench-core==0.1.1`)
  - Concurrency (default: `4`)
  - Livestream (default: `true` for progress visibility)
  - Extra args for custom options
- Uploads benchmark results as artifacts (even on failure)
- Uses `ANTHROPIC_API_KEY` and `OPENAI_API_KEY` from secrets
1 parent 062f568 commit 79a124f

File tree

4 files changed

+93
-3
lines changed

4 files changed

+93
-3
lines changed

.github/workflows/ci.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ jobs:
4646
extra_nix_config: |
4747
experimental-features = nix-command flakes
4848
49+
- name: Install uv
50+
run: curl -LsSf https://astral.sh/uv/install.sh | sh
51+
52+
- name: Add uv to PATH
53+
run: echo "$HOME/.local/bin" >> $GITHUB_PATH
54+
4955
- name: Run static checks
5056
run: make -j3 static-check
5157

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
name: Terminal-Bench
2+
3+
on:
4+
workflow_dispatch:
5+
inputs:
6+
dataset:
7+
description: 'Terminal-Bench dataset to use'
8+
required: false
9+
default: 'terminal-bench-core==0.1.1'
10+
type: string
11+
concurrency:
12+
description: 'Number of concurrent tasks (--n-concurrent)'
13+
required: false
14+
default: '4'
15+
type: string
16+
livestream:
17+
description: 'Enable livestream mode'
18+
required: false
19+
default: true
20+
type: boolean
21+
extra_args:
22+
description: 'Additional arguments to pass to terminal-bench'
23+
required: false
24+
type: string
25+
26+
jobs:
27+
benchmark:
28+
name: Run Terminal-Bench
29+
runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
30+
timeout-minutes: 180 # 3 hours - terminal-bench can take a long time
31+
steps:
32+
- name: Checkout code
33+
uses: actions/checkout@v4
34+
with:
35+
fetch-depth: 0 # Required for git describe to find tags
36+
37+
- uses: ./.github/actions/setup-cmux
38+
39+
- name: Install uv
40+
run: curl -LsSf https://astral.sh/uv/install.sh | sh
41+
42+
- name: Add uv to PATH
43+
run: echo "$HOME/.local/bin" >> $GITHUB_PATH
44+
45+
- name: Generate version file
46+
run: ./scripts/generate-version.sh
47+
48+
- name: Run Terminal-Bench
49+
run: make benchmark-terminal
50+
env:
51+
TB_DATASET: ${{ inputs.dataset }}
52+
TB_CONCURRENCY: ${{ inputs.concurrency }}
53+
TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
54+
TB_ARGS: ${{ inputs.extra_args }}
55+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
56+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
57+
58+
- name: Upload benchmark results
59+
if: always()
60+
uses: actions/upload-artifact@v4
61+
with:
62+
name: terminal-bench-results
63+
path: |
64+
terminal-bench-results/
65+
*.json
66+
if-no-files-found: warn
67+

benchmarks/__init__.py

Whitespace-only changes.

fmt.mk

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,25 @@
33
# This file contains all code formatting logic.
44
# Included by the main Makefile.
55

6-
.PHONY: fmt fmt-check fmt-prettier fmt-prettier-check fmt-shell fmt-shell-check fmt-nix fmt-nix-check
6+
.PHONY: fmt fmt-check fmt-prettier fmt-prettier-check fmt-shell fmt-shell-check fmt-nix fmt-nix-check fmt-python fmt-python-check
77

88
# Centralized patterns - single source of truth
99
PRETTIER_PATTERNS := 'src/**/*.{ts,tsx,json}' 'tests/**/*.ts' 'docs/**/*.md' 'package.json' 'tsconfig*.json' 'README.md'
1010
SHELL_SCRIPTS := scripts
11+
PYTHON_DIRS := benchmarks
1112

1213
# Always use bun x prettier for reproducibility (uses package.json version)
1314
PRETTIER := bun x prettier
1415

1516
# Tool availability checks
1617
SHFMT := $(shell command -v shfmt 2>/dev/null)
1718
NIX := $(shell command -v nix 2>/dev/null)
19+
UVX := $(shell command -v uvx 2>/dev/null)
1820

19-
fmt: fmt-prettier fmt-shell fmt-nix
21+
fmt: fmt-prettier fmt-shell fmt-python fmt-nix
2022
@echo "==> All formatting complete!"
2123

22-
fmt-check: fmt-prettier-check fmt-shell-check fmt-nix-check
24+
fmt-check: fmt-prettier-check fmt-shell-check fmt-python-check fmt-nix-check
2325
@echo "==> All formatting checks passed!"
2426

2527
fmt-prettier:
@@ -48,6 +50,21 @@ else
4850
@shfmt -i 2 -ci -bn -d $(SHELL_SCRIPTS)
4951
endif
5052

53+
# Helper target to check for uvx
54+
.check-uvx:
55+
ifeq ($(UVX),)
56+
@echo "Error: uvx not found. Install with: curl -LsSf https://astral.sh/uv/install.sh | sh"
57+
@exit 1
58+
endif
59+
60+
fmt-python: .check-uvx
61+
@echo "Formatting Python files..."
62+
@uvx ruff format $(PYTHON_DIRS)
63+
64+
fmt-python-check: .check-uvx
65+
@echo "Checking Python formatting..."
66+
@uvx ruff format --check $(PYTHON_DIRS)
67+
5168
fmt-nix:
5269
ifeq ($(NIX),)
5370
@echo "Nix not found; skipping Nix formatting"

0 commit comments

Comments
 (0)