diff --git a/.agents/skills/run-eval.md b/.agents/skills/run-eval.md
index 54e18775ef..2e33366b64 100644
--- a/.agents/skills/run-eval.md
+++ b/.agents/skills/run-eval.md
@@ -32,7 +32,7 @@ curl -X POST \
 **Key parameters:**
 
 - `benchmark`: `swebench`, `swebenchmultimodal`, `gaia`, `swtbench`, `commit0`, `multiswebench`
-- `eval_limit`: `1`, `50`, `100`, `200`, `500`
+- `eval_limit`: Any positive integer (e.g., `1`, `10`, `50`, `200`)
 - `model_ids`: See `.github/run-eval/resolve_model_config.py` for available models
 - `benchmarks_branch`: Use feature branch from the benchmarks repo to test benchmark changes before merging
diff --git a/.github/workflows/run-eval.yml b/.github/workflows/run-eval.yml
index 62b8d5029c..4f04a63d8d 100644
--- a/.github/workflows/run-eval.yml
+++ b/.github/workflows/run-eval.yml
@@ -32,16 +32,10 @@ on:
         default: false
         type: boolean
       eval_limit:
-        description: Number of instances to run
+        description: Number of instances to run (any positive integer)
         required: false
         default: '1'
-        type: choice
-        options:
-          - '1'
-          - '100'
-          - '50'
-          - '200'
-          - '500'
+        type: string
       model_ids:
         description: Comma-separated model IDs to evaluate. Must be keys of MODELS
           in resolve_model_config.py. Defaults to first model in that dict.
@@ -138,6 +132,16 @@ jobs:
         with:
           python-version: '3.13'
 
+      - name: Validate eval_limit
+        if: github.event_name == 'workflow_dispatch'
+        env:
+          EVAL_LIMIT: ${{ github.event.inputs.eval_limit }}
+        run: |
+          if ! [[ "$EVAL_LIMIT" =~ ^[1-9][0-9]*$ ]]; then
+            echo "Error: eval_limit must be a positive integer, got: $EVAL_LIMIT"
+            exit 1
+          fi
+
       - name: Validate SDK reference (semantic version check)
         if: github.event_name == 'workflow_dispatch'
         env: