Run Eval (swebench) test fix/tool-call-compat-shim #3086
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- | |
| name: Run Eval | |
| run-name: Run Eval (${{ inputs.benchmark || 'swebench' }}) ${{ inputs.reason || github.event.label.name || 'release' }} | |
| on: | |
| pull_request_target: | |
| types: [labeled] | |
| release: | |
| types: [published] | |
| workflow_dispatch: | |
| inputs: | |
| benchmark: | |
| description: Benchmark to evaluate | |
| required: false | |
| default: swebench | |
| type: choice | |
| options: | |
| - gaia | |
| - swebench | |
| - swtbench | |
| - commit0 | |
| - swebenchmultimodal | |
| - terminalbench | |
| sdk_ref: | |
| description: SDK commit/ref to evaluate (must be a semantic version like v1.0.0 unless 'Allow unreleased branches' is checked) | |
| required: true | |
| default: v1.16.1 | |
| allow_unreleased_branches: | |
| description: Allow unreleased branches (bypasses semantic version requirement) | |
| required: false | |
| default: false | |
| type: boolean | |
| eval_limit: | |
| description: Number of instances to run (any positive integer) | |
| required: false | |
| default: '1' | |
| type: string | |
| model_ids: | |
| description: Comma-separated model IDs to evaluate. Must be keys of MODELS in resolve_model_config.py. Defaults to first model in that | |
| dict. | |
| required: false | |
| default: '' | |
| type: string | |
| reason: | |
| description: Reason for manual trigger | |
| required: false | |
| default: '' | |
| eval_branch: | |
| description: Evaluation repo branch to use (for testing feature branches) | |
| required: false | |
| default: main | |
| type: string | |
| benchmarks_branch: | |
| description: Benchmarks repo branch to use (for testing feature branches) | |
| required: false | |
| default: main | |
| type: string | |
| instance_ids: | |
| description: >- | |
| Comma-separated instance IDs to evaluate. | |
| Example: "django__django-11583,django__django-12345". | |
| Spaces around commas are automatically stripped. | |
| Leave empty to evaluate all instances up to eval_limit. | |
| required: false | |
| default: '' | |
| num_infer_workers: | |
| description: Number of inference workers (optional, overrides benchmark default) | |
| required: false | |
| default: '' | |
| type: string | |
| num_eval_workers: | |
| description: Number of evaluation workers (optional, overrides benchmark default) | |
| required: false | |
| default: '' | |
| type: string | |
| enable_conversation_event_logging: | |
| description: 'Enable Datadog persistence for conversation events (default: true)' | |
| required: false | |
| default: true | |
| type: boolean | |
| max_retries: | |
| description: Max retries per instance (passed to benchmarks) | |
| required: false | |
| default: '3' | |
| type: string | |
| tool_preset: | |
| description: >- | |
| Tool preset for file editing. 'default' uses FileEditorTool, | |
| 'gemini' uses read_file/write_file/edit/list_directory, | |
| 'gpt5' uses apply_patch tool. | |
| required: false | |
| default: default | |
| type: choice | |
| options: | |
| - default | |
| - gemini | |
| - gpt5 | |
| - planning | |
| agent_type: | |
| description: >- | |
| Agent type: 'default' for standard Agent, | |
| 'acp-claude' for ACPAgent with Claude Code, | |
| 'acp-codex' for ACPAgent with Codex, | |
| 'acp-gemini' for ACPAgent with Gemini CLI. | |
| required: false | |
| default: default | |
| type: choice | |
| options: | |
| - default | |
| - acp-claude | |
| - acp-codex | |
| - acp-gemini | |
| partial_archive_url: | |
| description: Resume partial work from full archive tar.gz | |
| required: false | |
| default: '' | |
| type: string | |
| env: | |
| EVAL_REPO: OpenHands/evaluation | |
| EVAL_WORKFLOW: eval-job.yml | |
| jobs: | |
| print-parameters: | |
| if: > | |
| github.event_name == 'release' || | |
| github.event_name == 'workflow_dispatch' || | |
| (github.event_name == 'pull_request_target' && | |
| (github.event.label.name == 'run-eval-1' || | |
| github.event.label.name == 'run-eval-50' || | |
| github.event.label.name == 'run-eval-200' || | |
| github.event.label.name == 'run-eval-500')) | |
| runs-on: ubuntu-latest | |
| steps: | |
| - name: Print all parameters | |
| run: | | |
| echo "=== Workflow Parameters ===" | |
| echo "Event: ${{ github.event_name }}" | |
| echo "Actor: ${{ github.actor }}" | |
| echo "Ref: ${{ github.ref }}" | |
| echo "" | |
| echo "=== Input Parameters ===" | |
| echo "benchmark: ${{ github.event.inputs.benchmark || 'swebench' }}" | |
| echo "sdk_ref: ${{ github.event.inputs.sdk_ref || 'N/A' }}" | |
| echo "allow_unreleased_branches: ${{ github.event.inputs.allow_unreleased_branches || 'false' }}" | |
| echo "eval_limit: ${{ github.event.inputs.eval_limit || '1' }}" | |
| echo "model_ids: ${{ github.event.inputs.model_ids || '(default)' }}" | |
| echo "reason: ${{ github.event.inputs.reason || 'N/A' }}" | |
| echo "eval_branch: ${{ github.event.inputs.eval_branch || 'main' }}" | |
| echo "benchmarks_branch: ${{ github.event.inputs.benchmarks_branch || 'main' }}" | |
| echo "instance_ids: ${{ github.event.inputs.instance_ids || 'N/A' }}" | |
| echo "num_infer_workers: ${{ github.event.inputs.num_infer_workers || '(default)' }}" | |
| echo "num_eval_workers: ${{ github.event.inputs.num_eval_workers || '(default)' }}" | |
| echo "enable_conversation_event_logging: ${{ github.event.inputs.enable_conversation_event_logging || 'true' }}" | |
| echo "max_retries: ${{ github.event.inputs.max_retries || '3' }}" | |
| echo "tool_preset: ${{ github.event.inputs.tool_preset || 'default' }}" | |
| echo "agent_type: ${{ github.event.inputs.agent_type || 'default' }}" | |
| echo "partial_archive_url: ${{ github.event.inputs.partial_archive_url || 'N/A' }}" | |
| echo "" | |
| echo "=== Environment Variables ===" | |
| echo "EVAL_REPO: ${{ env.EVAL_REPO }}" | |
| echo "EVAL_WORKFLOW: ${{ env.EVAL_WORKFLOW }}" | |
| echo "" | |
| echo "=== Label (for PR events) ===" | |
| echo "Label: ${{ github.event.label.name || 'N/A' }}" | |
| build-and-evaluate: | |
| needs: print-parameters | |
| runs-on: ubuntu-latest | |
| permissions: | |
| contents: read | |
| packages: write | |
| actions: write | |
| issues: write | |
| pull-requests: write | |
| steps: | |
| - name: Checkout sdk code (base for validation) | |
| uses: actions/checkout@v6 | |
| with: | |
| ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.sdk_ref || (github.event_name == 'pull_request_target' && | |
| github.event.pull_request.base.ref || github.ref) }} | |
| fetch-depth: 0 | |
| - name: Set up Python | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: '3.13' | |
| - name: Install uv | |
| uses: astral-sh/setup-uv@v7 | |
| with: | |
| version: latest | |
| python-version: '3.13' | |
| - name: Validate eval_limit | |
| if: github.event_name == 'workflow_dispatch' | |
| run: | | |
| if ! [[ "${{ github.event.inputs.eval_limit }}" =~ ^[1-9][0-9]*$ ]]; then | |
| echo "Error: eval_limit must be a positive integer, got: ${{ github.event.inputs.eval_limit }}" | |
| exit 1 | |
| fi | |
| - name: Validate SDK reference and workflow branches | |
| if: github.event_name == 'workflow_dispatch' | |
| env: | |
| SDK_REF: ${{ github.event.inputs.sdk_ref }} | |
| ALLOW_UNRELEASED_BRANCHES: ${{ github.event.inputs.allow_unreleased_branches }} | |
| EVAL_BRANCH: ${{ github.event.inputs.eval_branch || 'main' }} | |
| BENCHMARKS_BRANCH: ${{ github.event.inputs.benchmarks_branch || 'main' }} | |
| run: | | |
| python3 .github/run-eval/validate_sdk_ref.py | |
| - name: Sync locked workspace dependencies | |
| run: | | |
| uv sync --frozen | |
| - name: Load model IDs from Python script | |
| id: load-models | |
| run: | | |
| # Extract all model IDs from resolve_model_config.py | |
| ALLOWED_MODEL_IDS=$(uv run python << 'EOF' | |
| import sys | |
| sys.path.insert(0, '.github/run-eval') | |
| from resolve_model_config import MODELS | |
| import json | |
| print(json.dumps(list(MODELS.keys()))) | |
| EOF | |
| ) | |
| DEFAULT_MODEL=$(echo "$ALLOWED_MODEL_IDS" | jq -r '.[0]') | |
| if [ -z "$DEFAULT_MODEL" ] || [ "$DEFAULT_MODEL" = "null" ]; then | |
| echo "No models configured" >&2 | |
| exit 1 | |
| fi | |
| echo "allowed_model_ids=$ALLOWED_MODEL_IDS" >> "$GITHUB_OUTPUT" | |
| echo "default_model=$DEFAULT_MODEL" >> "$GITHUB_OUTPUT" | |
| - name: Resolve parameters | |
| id: params | |
| env: | |
| DEFAULT_MODEL: ${{ steps.load-models.outputs.default_model }} | |
| ALLOWED_MODEL_IDS_JSON: ${{ steps.load-models.outputs.allowed_model_ids }} | |
| PAT_TOKEN_DEFAULT: ${{ secrets.PAT_TOKEN }} | |
| run: | | |
| set -euo pipefail | |
| # Set PAT token for cross-repo workflow dispatch | |
| PAT_TOKEN="$PAT_TOKEN_DEFAULT" | |
| if [ -z "$PAT_TOKEN" ]; then | |
| echo "Missing PAT token" >&2 | |
| exit 1 | |
| fi | |
| echo "PAT_TOKEN=$PAT_TOKEN" >> "$GITHUB_ENV" | |
| # Determine eval limit and SDK SHA based on trigger | |
| if [ "${{ github.event_name }}" = "pull_request_target" ]; then | |
| LABEL="${{ github.event.label.name }}" | |
| case "$LABEL" in | |
| run-eval-1) EVAL_LIMIT=1 ;; | |
| run-eval-50) EVAL_LIMIT=50 ;; | |
| run-eval-200) EVAL_LIMIT=200 ;; | |
| run-eval-500) EVAL_LIMIT=500 ;; | |
| *) echo "Unsupported label $LABEL" >&2; exit 1 ;; | |
| esac | |
| SDK_SHA="${{ github.event.pull_request.head.sha }}" | |
| PR_NUMBER="${{ github.event.pull_request.number }}" | |
| TRIGGER_DESCRIPTION="Label '${LABEL}' on PR #${PR_NUMBER}" | |
| elif [ "${{ github.event_name }}" = "release" ]; then | |
| EVAL_LIMIT=50 | |
| # Use tag instead of target_commitish because release branches are automatically deleted after merge | |
| SDK_SHA=$(git rev-parse "${{ github.event.release.tag_name }}") | |
| PR_NUMBER="" | |
| TRIGGER_DESCRIPTION="Release ${{ github.event.release.tag_name }}" | |
| else | |
| EVAL_LIMIT="${{ github.event.inputs.eval_limit }}" | |
| SDK_REF="${{ github.event.inputs.sdk_ref }}" | |
| # Convert ref to SHA for manual dispatch | |
| # Resolve SHA robustly for both branch refs and raw SHAs (avoid double-prefix issues) | |
| SDK_SHA=$(git rev-parse --verify "$SDK_REF^{commit}" 2>/dev/null || \ | |
| git rev-parse --verify "origin/$SDK_REF^{commit}" 2>/dev/null || \ | |
| echo "$SDK_REF") | |
| PR_NUMBER="" | |
| REASON="${{ github.event.inputs.reason }}" | |
| if [ -z "$REASON" ]; then | |
| REASON="manual" | |
| fi | |
| TRIGGER_DESCRIPTION="Manual trigger: ${REASON}" | |
| fi | |
| # Normalize and validate model IDs | |
| MODELS_INPUT="${{ github.event_name == 'workflow_dispatch' && github.event.inputs.model_ids || '' }}" | |
| if [ -z "$MODELS_INPUT" ]; then | |
| MODELS_INPUT="$DEFAULT_MODEL" | |
| fi | |
| MODELS=$(printf '%s' "$MODELS_INPUT" | tr ', ' '\n' | sed '/^$/d' | paste -sd, -) | |
| ALLOWED_LIST=$(echo "$ALLOWED_MODEL_IDS_JSON" | jq -r '.[]') | |
| for MODEL in ${MODELS//,/ }; do | |
| if ! echo "$ALLOWED_LIST" | grep -Fx "$MODEL" >/dev/null; then | |
| echo "Model ID '$MODEL' not found in resolve_model_config.py" >&2 | |
| echo "Available models: $(echo "$ALLOWED_LIST" | paste -sd, -)" >&2 | |
| exit 1 | |
| fi | |
| done | |
| # Sanitize values to avoid GITHUB_OUTPUT parse errors (e.g., raw SHAs) | |
| SDK_SHA=$(printf '%s' "$SDK_SHA" | tr -d '\n\r') | |
| EVAL_LIMIT=$(printf '%s' "$EVAL_LIMIT" | tr -d '\n\r') | |
| PR_NUMBER=$(printf '%s' "$PR_NUMBER" | tr -d '\n\r') | |
| MODELS=$(printf '%s' "$MODELS" | tr -d '\n\r') | |
| TRIGGER_DESCRIPTION=$(printf '%s' "$TRIGGER_DESCRIPTION" | tr -d '\n\r') | |
| printf 'eval_limit=%s\n' "$EVAL_LIMIT" >> "$GITHUB_OUTPUT" | |
| printf 'sdk_sha=%s\n' "$SDK_SHA" >> "$GITHUB_OUTPUT" | |
| printf 'models=%s\n' "$MODELS" >> "$GITHUB_OUTPUT" | |
| printf 'pr_number=%s\n' "$PR_NUMBER" >> "$GITHUB_OUTPUT" | |
| printf 'trigger_desc=%s\n' "$TRIGGER_DESCRIPTION" >> "$GITHUB_OUTPUT" | |
| - name: Resolve model configurations and verify availability | |
| id: resolve-models | |
| env: | |
| MODEL_IDS: ${{ steps.params.outputs.models }} | |
| LLM_API_KEY: ${{ secrets.LLM_API_KEY_EVAL }} | |
| LLM_BASE_URL: https://llm-proxy.eval.all-hands.dev | |
| run: | | |
| uv run python .github/run-eval/resolve_model_config.py | |
| - name: Dispatch evaluation workflow | |
| env: | |
| SDK_SHA: ${{ steps.params.outputs.sdk_sha }} | |
| EVAL_LIMIT: ${{ steps.params.outputs.eval_limit }} | |
| MODELS_JSON: ${{ steps.resolve-models.outputs.models_json }} | |
| EVAL_REPO: ${{ env.EVAL_REPO }} | |
| EVAL_WORKFLOW: ${{ env.EVAL_WORKFLOW }} | |
| EVAL_BRANCH: ${{ github.event.inputs.eval_branch || 'main' }} | |
| BENCHMARKS_BRANCH: ${{ github.event.inputs.benchmarks_branch || 'main' }} | |
| BENCHMARK: ${{ github.event.inputs.benchmark || 'swebench' }} | |
| TRIGGER_REASON: ${{ github.event.inputs.reason }} | |
| PR_NUMBER: ${{ steps.params.outputs.pr_number }} | |
| INSTANCE_IDS: ${{ github.event.inputs.instance_ids || '' }} | |
| NUM_INFER_WORKERS: ${{ github.event.inputs.num_infer_workers || '' }} | |
| NUM_EVAL_WORKERS: ${{ github.event.inputs.num_eval_workers || '' }} | |
| ENABLE_CONVERSATION_EVENT_LOGGING: ${{ github.event.inputs.enable_conversation_event_logging || true }} | |
| MAX_RETRIES: ${{ github.event.inputs.max_retries || '3' }} | |
| TOOL_PRESET: ${{ github.event.inputs.tool_preset || 'default' }} | |
| AGENT_TYPE: ${{ github.event.inputs.agent_type || 'default' }} | |
| PARTIAL_ARCHIVE_URL: ${{ github.event.inputs.partial_archive_url || '' }} | |
| TRIGGERED_BY: ${{ github.actor }} | |
| run: | | |
| # Normalize instance_ids: strip all spaces | |
| INSTANCE_IDS=$(printf '%s' "$INSTANCE_IDS" | tr -d ' ') | |
| echo "Dispatching evaluation workflow with SDK commit: $SDK_SHA (benchmark: $BENCHMARK, eval branch: $EVAL_BRANCH, benchmarks branch: $BENCHMARKS_BRANCH, tool preset: $TOOL_PRESET)" | |
| PAYLOAD=$(jq -n \ | |
| --arg sdk "$SDK_SHA" \ | |
| --arg sdk_run_id "${{ github.run_id }}" \ | |
| --arg eval_limit "$EVAL_LIMIT" \ | |
| --argjson models "$MODELS_JSON" \ | |
| --arg ref "$EVAL_BRANCH" \ | |
| --arg reason "$TRIGGER_REASON" \ | |
| --arg pr "$PR_NUMBER" \ | |
| --arg benchmarks "$BENCHMARKS_BRANCH" \ | |
| --arg benchmark "$BENCHMARK" \ | |
| --arg instance_ids "$INSTANCE_IDS" \ | |
| --arg num_infer_workers "$NUM_INFER_WORKERS" \ | |
| --arg num_eval_workers "$NUM_EVAL_WORKERS" \ | |
| --argjson enable_conversation_event_logging "$ENABLE_CONVERSATION_EVENT_LOGGING" \ | |
| --arg max_retries "$MAX_RETRIES" \ | |
| --arg tool_preset "$TOOL_PRESET" \ | |
| --arg agent_type "$AGENT_TYPE" \ | |
| --arg partial_archive_url "$PARTIAL_ARCHIVE_URL" \ | |
| --arg triggered_by "$TRIGGERED_BY" \ | |
| '{ref: $ref, inputs: {sdk_commit: $sdk, sdk_workflow_run_id: $sdk_run_id, eval_limit: $eval_limit, models_json: ($models | tostring), trigger_reason: $reason, pr_number: $pr, benchmarks_branch: $benchmarks, benchmark: $benchmark, instance_ids: $instance_ids, num_infer_workers: $num_infer_workers, num_eval_workers: $num_eval_workers, enable_conversation_event_logging: $enable_conversation_event_logging, max_retries: $max_retries, tool_preset: $tool_preset, agent_type: $agent_type, partial_archive_url: $partial_archive_url, triggered_by: $triggered_by}}') | |
| RESPONSE=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" -X POST \ | |
| -H "Authorization: token $PAT_TOKEN" \ | |
| -H "Accept: application/vnd.github+json" \ | |
| -d "$PAYLOAD" \ | |
| "https://api.github.com/repos/${EVAL_REPO}/actions/workflows/${EVAL_WORKFLOW}/dispatches") | |
| if [ "$RESPONSE" != "204" ]; then | |
| echo "Dispatch failed (status $RESPONSE):" >&2 | |
| cat /tmp/dispatch.out >&2 | |
| exit 1 | |
| fi | |
| - name: Comment on PR | |
| env: | |
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |
| SDK_SHA: ${{ steps.params.outputs.sdk_sha }} | |
| EVAL_LIMIT: ${{ steps.params.outputs.eval_limit }} | |
| MODELS: ${{ steps.params.outputs.models }} | |
| TRIGGER_DESC: ${{ steps.params.outputs.trigger_desc }} | |
| EVENT_NAME: ${{ github.event_name }} | |
| PR_NUMBER_INPUT: ${{ steps.params.outputs.pr_number }} | |
| run: | | |
| set -euo pipefail | |
| PR_NUMBER="$PR_NUMBER_INPUT" | |
| if [ "$EVENT_NAME" = "release" ] && [ -z "$PR_NUMBER" ]; then | |
| # Attempt to find the merged PR for this commit | |
| PR_NUMBER=$(curl -sS \ | |
| -H "Authorization: Bearer $GITHUB_TOKEN" \ | |
| -H "Accept: application/vnd.github+json" \ | |
| "https://api.github.com/repos/${{ github.repository }}/commits/${SDK_SHA}/pulls" \ | |
| | jq -r '.[0].number // ""') | |
| fi | |
| if [ -z "$PR_NUMBER" ]; then | |
| echo "No PR found to comment on; skipping comment" | |
| exit 0 | |
| fi | |
| COMMENT_BODY=$(printf '**Evaluation Triggered**\n\n- Trigger: %s\n- SDK: %s\n- Eval limit: %s\n- Models: %s\n' \ | |
| "$TRIGGER_DESC" "$SDK_SHA" "$EVAL_LIMIT" "$MODELS") | |
| curl -sS -X POST \ | |
| -H "Accept: application/vnd.github+json" \ | |
| -H "Authorization: Bearer $GITHUB_TOKEN" \ | |
| "https://api.github.com/repos/${{ github.repository }}/issues/${PR_NUMBER}/comments" \ | |
| -d "$(jq -n --arg body "$COMMENT_BODY" '{body: $body}')" |