Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/chromatic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,3 @@ jobs:
projectToken: ${{ secrets.CHROMATIC_PROJECT_TOKEN }}
exitZeroOnChanges: true
onlyChanged: true

6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ jobs:

- uses: ./.github/actions/setup-cmux

- name: Build worker files
run: make build-main

- name: Run tests with coverage
run: bun test --coverage --coverage-reporter=lcov ${{ github.event.inputs.test_filter || 'src' }}

Expand All @@ -96,6 +99,9 @@ jobs:

- uses: ./.github/actions/setup-cmux

- name: Build worker files
run: make build-main

- name: Run integration tests with coverage
# --silent suppresses per-test output (17 test files × 32 workers = overwhelming logs)
run: TEST_INTEGRATION=1 bun x jest --coverage --maxWorkers=100% --silent ${{ github.event.inputs.test_filter || 'tests' }}
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/nightly-terminal-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ name: Nightly Terminal-Bench
on:
schedule:
# Run full benchmark suite (~80 tasks) every night at midnight UTC
- cron: '0 0 * * *'
- cron: "0 0 * * *"
workflow_dispatch:
inputs:
models:
description: 'Models to test (comma-separated, or "all" for both)'
required: false
default: 'all'
default: "all"
type: string

jobs:
Expand Down Expand Up @@ -41,9 +41,9 @@ jobs:
uses: ./.github/workflows/terminal-bench.yml
with:
model_name: ${{ matrix.model }}
thinking_level: 'high'
dataset: 'terminal-bench-core==0.1.1'
concurrency: '4'
thinking_level: "high"
dataset: "terminal-bench-core==0.1.1"
concurrency: "4"
livestream: true
secrets:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
Expand Down
20 changes: 10 additions & 10 deletions .github/workflows/publish-npm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on:
branches:
- main
tags:
- 'v*'
- "v*"
workflow_dispatch:

permissions:
Expand All @@ -24,12 +24,12 @@ jobs:

- uses: ./.github/actions/setup-cmux
with:
install-imagemagick: 'true'
install-imagemagick: "true"

# Sets up .npmrc with the auth token
- uses: actions/setup-node@v4
with:
registry-url: 'https://registry.npmjs.org'
registry-url: "https://registry.npmjs.org"

- run: sudo npm i -g npm@latest

Expand All @@ -38,10 +38,10 @@ jobs:
run: |
# Get base version from package.json
BASE_VERSION=$(node -p "require('./package.json').version")

# Generate git describe version
GIT_DESCRIBE=$(git describe --tags --always --dirty 2>/dev/null || echo "unknown")

if [[ $GITHUB_REF == refs/tags/* ]]; then
# For tags, use the base version as-is (stable release)
NPM_VERSION="${BASE_VERSION}"
Expand All @@ -56,13 +56,13 @@ jobs:
NPM_TAG="next"
echo "Publishing pre-release: ${NPM_VERSION}"
fi

echo "version=${NPM_VERSION}" >> $GITHUB_OUTPUT
echo "tag=${NPM_TAG}" >> $GITHUB_OUTPUT

# Update package.json with the new version
node -e "const fs = require('fs'); const pkg = JSON.parse(fs.readFileSync('package.json')); pkg.version = '${NPM_VERSION}'; fs.writeFileSync('package.json', JSON.stringify(pkg, null, 2) + '\n');"

echo "Updated package.json to version ${NPM_VERSION}"

- name: Generate version file
Expand All @@ -76,7 +76,7 @@ jobs:
run: |
PACKAGE_NAME=$(node -p "require('./package.json').name")
VERSION="${{ steps.version.outputs.version }}"

if npm view "${PACKAGE_NAME}@${VERSION}" version &>/dev/null; then
echo "exists=true" >> $GITHUB_OUTPUT
echo "Version ${VERSION} already exists on npm"
Expand All @@ -95,7 +95,7 @@ jobs:
PACKAGE_NAME=$(node -p "require('./package.json').name")
VERSION="${{ steps.version.outputs.version }}"
TAG="${{ steps.version.outputs.tag }}"

echo "Version ${VERSION} already published, updating dist-tag to ${TAG}"
npm dist-tag add "${PACKAGE_NAME}@${VERSION}" "${TAG}"

Expand Down
37 changes: 18 additions & 19 deletions .github/workflows/terminal-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,34 @@ on:
workflow_call:
inputs:
model_name:
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)'
description: "Model to use (e.g., anthropic:claude-sonnet-4-5)"
required: false
type: string
thinking_level:
description: 'Thinking level (off, low, medium, high)'
description: "Thinking level (off, low, medium, high)"
required: false
type: string
dataset:
description: 'Terminal-Bench dataset to use'
description: "Terminal-Bench dataset to use"
required: false
type: string
default: 'terminal-bench-core==0.1.1'
default: "terminal-bench-core==0.1.1"
concurrency:
description: 'Number of concurrent tasks (--n-concurrent)'
description: "Number of concurrent tasks (--n-concurrent)"
required: false
type: string
default: '4'
default: "4"
livestream:
description: 'Enable livestream mode'
description: "Enable livestream mode"
required: false
type: boolean
default: true
sample_size:
description: 'Number of random tasks to run (empty = all tasks)'
description: "Number of random tasks to run (empty = all tasks)"
required: false
type: string
extra_args:
description: 'Additional arguments to pass to terminal-bench'
description: "Additional arguments to pass to terminal-bench"
required: false
type: string
secrets:
Expand All @@ -42,34 +42,34 @@ on:
workflow_dispatch:
inputs:
dataset:
description: 'Terminal-Bench dataset to use'
description: "Terminal-Bench dataset to use"
required: false
default: 'terminal-bench-core==0.1.1'
default: "terminal-bench-core==0.1.1"
type: string
concurrency:
description: 'Number of concurrent tasks (--n-concurrent)'
description: "Number of concurrent tasks (--n-concurrent)"
required: false
default: '4'
default: "4"
type: string
livestream:
description: 'Enable livestream mode'
description: "Enable livestream mode"
required: false
default: true
type: boolean
sample_size:
description: 'Number of random tasks to run (empty = all tasks)'
description: "Number of random tasks to run (empty = all tasks)"
required: false
type: string
model_name:
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)'
description: "Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)"
required: false
type: string
thinking_level:
description: 'Thinking level (off, low, medium, high)'
description: "Thinking level (off, low, medium, high)"
required: false
type: string
extra_args:
description: 'Additional arguments to pass to terminal-bench'
description: "Additional arguments to pass to terminal-bench"
required: false
type: string

Expand Down Expand Up @@ -148,4 +148,3 @@ jobs:
runs/
if-no-files-found: warn
retention-days: 30

6 changes: 1 addition & 5 deletions .storybook/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,7 @@ import path from "path";

const config: StorybookConfig = {
stories: ["../src/**/*.stories.@(ts|tsx)"],
addons: [
"@storybook/addon-links",
"@storybook/addon-docs",
"@storybook/addon-interactions",
],
addons: ["@storybook/addon-links", "@storybook/addon-docs", "@storybook/addon-interactions"],
framework: {
name: "@storybook/react-vite",
options: {},
Expand Down
1 change: 0 additions & 1 deletion .storybook/mocks/version.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,3 @@ export const VERSION = {
git_describe: "v1.0.0",
buildTime: "2024-01-24T17:41:00Z", // 9:41 AM PST
};

6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -203,11 +203,11 @@ check-deadcode: node_modules/.installed ## Check for potential dead code (manual
|| echo "✓ No obvious dead code found"

## Testing
test-integration: node_modules/.installed ## Run all tests (unit + integration)
test-integration: node_modules/.installed build-main ## Run all tests (unit + integration)
@bun test src
@TEST_INTEGRATION=1 bun x jest tests

test-unit: node_modules/.installed ## Run unit tests
test-unit: node_modules/.installed build-main ## Run unit tests
@bun test src

test: test-unit ## Alias for test-unit
Expand All @@ -220,7 +220,7 @@ test-coverage: ## Run tests with coverage

test-e2e: ## Run end-to-end tests
@$(MAKE) build
@CMUX_E2E_LOAD_DIST=1 CMUX_E2E_SKIP_BUILD=1 PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 bun x playwright test --project=electron
@CMUX_E2E_LOAD_DIST=1 CMUX_E2E_SKIP_BUILD=1 PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 bun x playwright test --project=electron $(PLAYWRIGHT_ARGS)

## Distribution
dist: build ## Build distributable packages
Expand Down
1 change: 1 addition & 0 deletions benchmarks/terminal_bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ The benchmark uses a **global timeout** applied to all tasks. The default is **3
**Design Rationale:**

Based on analysis of Oct 30, 2025 nightly runs:

- Longest successful task: `blind-maze-explorer-algorithm.hard` at 20 minutes
- 95th percentile: ~15 minutes
- Mean duration: ~6 minutes
Expand Down
Loading