Run Eval (swebench) test fix/tool-call-compat-shim #3086
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- | |
| name: Run Eval | |
| run-name: Run Eval (${{ inputs.benchmark || 'swebench' }}) ${{ inputs.reason || github.event.label.name || 'release' }} | |
| on: | |
| pull_request_target: | |
| types: [labeled] | |
| release: | |
| types: [published] | |
| workflow_dispatch: | |
| inputs: | |
| benchmark: | |
| description: Benchmark to evaluate | |
| required: false | |
| default: swebench | |
| type: choice | |
| options: | |
| - gaia | |
| - swebench | |
| - swtbench | |
| - commit0 | |
| - swebenchmultimodal | |
| - terminalbench | |
| sdk_ref: | |
| description: SDK commit/ref to evaluate (must be a semantic version like v1.0.0 unless 'Allow unreleased branches' is checked) | |
| required: true | |
| default: v1.16.1 | |
| allow_unreleased_branches: | |
| description: Allow unreleased branches (bypasses semantic version requirement) | |
| required: false | |
| default: false | |
| type: boolean | |
| eval_limit: | |
| description: Number of instances to run (any positive integer) | |
| required: false | |
| default: '1' | |
| type: string | |
| model_ids: | |
| description: Comma-separated model IDs to evaluate. Must be keys of MODELS in resolve_model_config.py. Defaults to first model in that | |
| dict. | |
| required: false | |
| default: '' | |
| type: string | |
| reason: | |
| description: Reason for manual trigger | |
| required: false | |
| default: '' | |
| eval_branch: | |
| description: Evaluation repo branch to use (for testing feature branches) | |
| required: false | |
| default: main | |
| type: string | |
| benchmarks_branch: | |
| description: Benchmarks repo branch to use (for testing feature branches) | |
| required: false | |
| default: main | |
| type: string | |
| instance_ids: | |
| description: >- | |
| Comma-separated instance IDs to evaluate. | |
| Example: "django__django-11583,django__django-12345". | |
| Spaces around commas are automatically stripped. | |
| Leave empty to evaluate all instances up to eval_limit. | |
| required: false | |
| default: '' | |
| num_infer_workers: | |
| description: Number of inference workers (optional, overrides benchmark default) | |
| required: false | |
| default: '' | |
| type: string | |
| num_eval_workers: | |
| description: Number of evaluation workers (optional, overrides benchmark default) | |
| required: false | |
| default: '' | |
| type: string | |
| enable_conversation_event_logging: | |
| description: 'Enable Datadog persistence for conversation events (default: true)' | |
| required: false | |
| default: true | |
| type: boolean | |
| max_retries: | |
| description: Max retries per instance (passed to benchmarks) | |
| required: false | |
| default: '3' | |
| type: string | |
| tool_preset: | |
| description: >- | |
| Tool preset for file editing. 'default' uses FileEditorTool, | |
| 'gemini' uses read_file/write_file/edit/list_directory, | |
| 'gpt5' uses apply_patch tool. | |
| required: false | |
| default: default | |
| type: choice | |
| options: | |
| - default | |
| - gemini | |
| - gpt5 | |
| - planning | |
| agent_type: | |
| description: >- | |
| Agent type: 'default' for standard Agent, | |
| 'acp-claude' for ACPAgent with Claude Code, | |
| 'acp-codex' for ACPAgent with Codex, | |
| 'acp-gemini' for ACPAgent with Gemini CLI. | |
| required: false | |
| default: default | |
| type: choice | |
| options: | |
| - default | |
| - acp-claude | |
| - acp-codex | |
| - acp-gemini | |
| partial_archive_url: | |
| description: Resume partial work from full archive tar.gz | |
| required: false | |
| default: '' | |
| type: string | |
| env: | |
| EVAL_REPO: OpenHands/evaluation | |
| EVAL_WORKFLOW: eval-job.yml | |
| jobs: | |
| print-parameters: | |
| if: > | |
| github.event_name == 'release' || | |
| github.event_name == 'workflow_dispatch' || | |
| (github.event_name == 'pull_request_target' && | |
| (github.event.label.name == 'run-eval-1' || | |
| github.event.label.name == 'run-eval-50' || | |
| github.event.label.name == 'run-eval-200' || | |
| github.event.label.name == 'run-eval-500')) | |
| runs-on: ubuntu-latest | |
| steps: | |
| - name: Print all parameters | |
| run: | | |
| echo "=== Workflow Parameters ===" | |
| echo "Event: ${{ github.event_name }}" | |
| echo "Actor: ${{ github.actor }}" | |
| echo "Ref: ${{ github.ref }}" | |
| echo "" | |
| echo "=== Input Parameters ===" | |
| echo "benchmark: ${{ github.event.inputs.benchmark || 'swebench' }}" | |
| echo "sdk_ref: ${{ github.event.inputs.sdk_ref || 'N/A' }}" | |
| echo "allow_unreleased_branches: ${{ github.event.inputs.allow_unreleased_branches || 'false' }}" | |
| echo "eval_limit: ${{ github.event.inputs.eval_limit || '1' }}" | |
| echo "model_ids: ${{ github.event.inputs.model_ids || '(default)' }}" | |
| echo "reason: ${{ github.event.inputs.reason || 'N/A' }}" | |
| echo "eval_branch: ${{ github.event.inputs.eval_branch || 'main' }}" | |
| echo "benchmarks_branch: ${{ github.event.inputs.benchmarks_branch || 'main' }}" | |
| echo "instance_ids: ${{ github.event.inputs.instance_ids || 'N/A' }}" | |
| echo "num_infer_workers: ${{ github.event.inputs.num_infer_workers || '(default)' }}" | |
| echo "num_eval_workers: ${{ github.event.inputs.num_eval_workers || '(default)' }}" | |
| echo "enable_conversation_event_logging: ${{ github.event.inputs.enable_conversation_event_logging || 'true' }}" | |
| echo "max_retries: ${{ github.event.inputs.max_retries || '3' }}" | |
| echo "tool_preset: ${{ github.event.inputs.tool_preset || 'default' }}" | |
| echo "agent_type: ${{ github.event.inputs.agent_type || 'default' }}" | |
| echo "partial_archive_url: ${{ github.event.inputs.partial_archive_url || 'N/A' }}" | |
| echo "" | |
| echo "=== Environment Variables ===" | |
| echo "EVAL_REPO: ${{ env.EVAL_REPO }}" | |
| echo "EVAL_WORKFLOW: ${{ env.EVAL_WORKFLOW }}" | |
| echo "" | |
| echo "=== Label (for PR events) ===" | |
| echo "Label: ${{ github.event.label.name || 'N/A' }}" | |
| build-and-evaluate: | |
| needs: print-parameters | |
| runs-on: ubuntu-latest | |
| permissions: | |
| contents: read | |
| packages: write | |
| actions: write | |
| issues: write | |
| pull-requests: write | |
| steps: | |
| - name: Checkout sdk code (base for validation) | |
| uses: actions/checkout@v6 | |
| with: | |
| ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.sdk_ref || (github.event_name == 'pull_request_target' && | |
| github.event.pull_request.base.ref || github.ref) }} | |
| fetch-depth: 0 | |
| - name: Set up Python | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: '3.13' | |
| - name: Install uv | |
| uses: astral-sh/setup-uv@v7 | |
| with: | |
| version: latest | |
| python-version: '3.13' | |
| - name: Validate eval_limit | |
| if: github.event_name == 'workflow_dispatch' | |
| run: | | |
| if ! [[ "${{ github.event.inputs.eval_limit }}" =~ ^[1-9][0-9]*$ ]]; then | |
| echo "Error: eval_limit must be a positive integer, got: ${{ github.event.inputs.eval_limit }}" | |
| exit 1 | |
| fi | |
| - name: Validate SDK reference and workflow branches | |
| if: github.event_name == 'workflow_dispatch' | |
| env: | |
| SDK_REF: ${{ github.event.inputs.sdk_ref }} | |
| ALLOW_UNRELEASED_BRANCHES: ${{ github.event.inputs.allow_unreleased_branches }} | |
| EVAL_BRANCH: ${{ github.event.inputs.eval_branch || 'main' }} | |
| BENCHMARKS_BRANCH: ${{ github.event.inputs.benchmarks_branch || 'main' }} | |
| run: | | |
| python3 .github/run-eval/validate_sdk_ref.py | |
| - name: Sync locked workspace dependencies | |
| run: | | |
| uv sync --frozen | |
| - name: Load model IDs from Python script | |
| id: load-models | |
| run: | | |
| # Extract all model IDs from resolve_model_config.py | |
| ALLOWED_MODEL_IDS=$(uv run python << 'EOF' | |
| import sys | |
| sys.path.insert(0, '.github/run-eval') | |
| from resolve_model_config import MODELS | |
| import json | |
| print(json.dumps(list(MODELS.keys()))) | |
| EOF | |
| ) | |
| DEFAULT_MODEL=$(echo "$ALLOWED_MODEL_IDS" | jq -r '.[0]') | |
| if [ -z "$DEFAULT_MODEL" ] || [ "$DEFAULT_MODEL" = "null" ]; then | |
| echo "No models configured" >&2 | |
| exit 1 | |
| fi | |
| echo "allowed_model_ids=$ALLOWED_MODEL_IDS" >> "$GITHUB_OUTPUT" | |
| echo "default_model=$DEFAULT_MODEL" >> "$GITHUB_OUTPUT" | |
| - name: Resolve parameters | |
| id: params | |
| env: | |
| DEFAULT_MODEL: ${{ steps.load-models.outputs.default_model }} | |
| ALLOWED_MODEL_IDS_JSON: ${{ steps.load-models.outputs.allowed_model_ids }} | |
| PAT_TOKEN_DEFAULT: ${{ secrets.PAT_TOKEN }} | |
| run: | | |
| set -euo pipefail | |
| # Set PAT token for cross-repo workflow dispatch | |
| PAT_TOKEN="$PAT_TOKEN_DEFAULT" | |
| if [ -z "$PAT_TOKEN" ]; then | |
| echo "Missing PAT token" >&2 | |
| exit 1 | |
| fi | |
| echo "PAT_TOKEN=$PAT_TOKEN" >> "$GITHUB_ENV" | |
| # Determine eval limit and SDK SHA based on trigger | |
| if [ "${{ github.event_name }}" = "pull_request_target" ]; then | |
| LABEL="${{ github.event.label.name }}" | |
| case "$LABEL" in | |
| run-eval-1) EVAL_LIMIT=1 ;; | |
| run-eval-50) EVAL_LIMIT=50 ;; | |
| run-eval-200) EVAL_LIMIT=200 ;; | |
| run-eval-500) EVAL_LIMIT=500 ;; | |
| *) echo "Unsupported label $LABEL" >&2; exit 1 ;; | |
| esac | |
| SDK_SHA="${{ github.event.pull_request.head.sha }}" | |
| PR_NUMBER="${{ github.event.pull_request.number }}" | |
| TRIGGER_DESCRIPTION="Label '${LABEL}' on PR #${PR_NUMBER}" | |
| elif [ "${{ github.event_name }}" = "release" ]; then | |
| EVAL_LIMIT=50 | |
| # Use tag instead of target_commitish because release branches are automatically deleted after merge | |
| SDK_SHA=$(git rev-parse "${{ github.event.release.tag_name }}") | |
| PR_NUMBER="" | |
| TRIGGER_DESCRIPTION="Release ${{ github.event.release.tag_name }}" | |
| else | |
| EVAL_LIMIT="${{ github.event.inputs.eval_limit }}" | |
| SDK_REF="${{ github.event.inputs.sdk_ref }}" | |
| # Convert ref to SHA for manual dispatch | |
| # Resolve SHA robustly for both branch refs and raw SHAs (avoid double-prefix issues) | |
| SDK_SHA=$(git rev-parse --verify "$SDK_REF^{commit}" 2>/dev/null || \ | |
| git rev-parse --verify "origin/$SDK_REF^{commit}" 2>/dev/null || \ | |
| echo "$SDK_REF") | |
| PR_NUMBER="" | |
| REASON="${{ github.event.inputs.reason }}" | |
| if [ -z "$REASON" ]; then | |
| REASON="manual" | |
| fi | |
| TRIGGER_DESCRIPTION="Manual trigger: ${REASON}" | |
| fi | |
| # Normalize and validate model IDs | |
| MODELS_INPUT="${{ github.event_name == 'workflow_dispatch' && github.event.inputs.model_ids || '' }}" | |
| if [ -z "$MODELS_INPUT" ]; then | |
| MODELS_INPUT="$DEFAULT_MODEL" | |
| fi | |
| MODELS=$(printf '%s' "$MODELS_INPUT" | tr ', ' '\n' | sed '/^$/d' | paste -sd, -) | |
| ALLOWED_LIST=$(echo "$ALLOWED_MODEL_IDS_JSON" | jq -r '.[]') | |
| for MODEL in ${MODELS//,/ }; do | |
| if ! echo "$ALLOWED_LIST" | grep -Fx "$MODEL" >/dev/null; then | |
| echo "Model ID '$MODEL' not found in resolve_model_config.py" >&2 | |
| echo "Available models: $(echo "$ALLOWED_LIST" | paste -sd, -)" >&2 | |
| exit 1 | |
| fi | |
| done | |
| # Sanitize values to avoid GITHUB_OUTPUT parse errors (e.g., raw SHAs) | |
| SDK_SHA=$(printf '%s' "$SDK_SHA" | tr -d '\n\r') | |
| EVAL_LIMIT=$(printf '%s' "$EVAL_LIMIT" | tr -d '\n\r') | |
| PR_NUMBER=$(printf '%s' "$PR_NUMBER" | tr -d '\n\r') | |
| MODELS=$(printf '%s' "$MODELS" | tr -d '\n\r') | |
| TRIGGER_DESCRIPTION=$(printf '%s' "$TRIGGER_DESCRIPTION" | tr -d '\n\r') | |
| printf 'eval_limit=%s\n' "$EVAL_LIMIT" >> "$GITHUB_OUTPUT" | |
| printf 'sdk_sha=%s\n' "$SDK_SHA" >> "$GITHUB_OUTPUT" | |
| printf 'models=%s\n' "$MODELS" >> "$GITHUB_OUTPUT" | |
| printf 'pr_number=%s\n' "$PR_NUMBER" >> "$GITHUB_OUTPUT" | |
| printf 'trigger_desc=%s\n' "$TRIGGER_DESCRIPTION" >> "$GITHUB_OUTPUT" | |
| - name: Resolve model configurations and verify availability | |
| id: resolve-models | |
| env: | |
| MODEL_IDS: ${{ steps.params.outputs.models }} | |
| LLM_API_KEY: ${{ secrets.LLM_API_KEY_EVAL }} | |
| LLM_BASE_URL: https://llm-proxy.eval.all-hands.dev | |
| run: | | |
| uv run python .github/run-eval/resolve_model_config.py | |
| - name: Dispatch evaluation workflow | |
| env: | |
| SDK_SHA: ${{ steps.params.outputs.sdk_sha }} | |
| EVAL_LIMIT: ${{ steps.params.outputs.eval_limit }} | |
| MODELS_JSON: ${{ steps.resolve-models.outputs.models_json }} | |
| EVAL_REPO: ${{ env.EVAL_REPO }} | |
| EVAL_WORKFLOW: ${{ env.EVAL_WORKFLOW }} | |
| EVAL_BRANCH: ${{ github.event.inputs.eval_branch || 'main' }} | |
| BENCHMARKS_BRANCH: ${{ github.event.inputs.benchmarks_branch || 'main' }} | |
| BENCHMARK: ${{ github.event.inputs.benchmark || 'swebench' }} | |
| TRIGGER_REASON: ${{ github.event.inputs.reason }} | |
| PR_NUMBER: ${{ steps.params.outputs.pr_number }} | |
| INSTANCE_IDS: ${{ github.event.inputs.instance_ids || '' }} | |
| NUM_INFER_WORKERS: ${{ github.event.inputs.num_infer_workers || '' }} | |
| NUM_EVAL_WORKERS: ${{ github.event.inputs.num_eval_workers || '' }} | |
| ENABLE_CONVERSATION_EVENT_LOGGING: ${{ github.event.inputs.enable_conversation_event_logging || true }} | |
| MAX_RETRIES: ${{ github.event.inputs.max_retries || '3' }} | |
| TOOL_PRESET: ${{ github.event.inputs.tool_preset || 'default' }} | |
| AGENT_TYPE: ${{ github.event.inputs.agent_type || 'default' }} | |
| PARTIAL_ARCHIVE_URL: ${{ github.event.inputs.partial_archive_url || '' }} | |
| TRIGGERED_BY: ${{ github.actor }} | |
| run: | | |
| # Normalize instance_ids: strip all spaces | |
| INSTANCE_IDS=$(printf '%s' "$INSTANCE_IDS" | tr -d ' ') | |
| echo "Dispatching evaluation workflow with SDK commit: $SDK_SHA (benchmark: $BENCHMARK, eval branch: $EVAL_BRANCH, benchmarks branch: $BENCHMARKS_BRANCH, tool preset: $TOOL_PRESET)" | |
| PAYLOAD=$(jq -n \ | |
| --arg sdk "$SDK_SHA" \ | |
| --arg sdk_run_id "${{ github.run_id }}" \ | |
| --arg eval_limit "$EVAL_LIMIT" \ | |
| --argjson models "$MODELS_JSON" \ | |
| --arg ref "$EVAL_BRANCH" \ | |
| --arg reason "$TRIGGER_REASON" \ | |
| --arg pr "$PR_NUMBER" \ | |
| --arg benchmarks "$BENCHMARKS_BRANCH" \ | |
| --arg benchmark "$BENCHMARK" \ | |
| --arg instance_ids "$INSTANCE_IDS" \ | |
| --arg num_infer_workers "$NUM_INFER_WORKERS" \ | |
| --arg num_eval_workers "$NUM_EVAL_WORKERS" \ | |
| --argjson enable_conversation_event_logging "$ENABLE_CONVERSATION_EVENT_LOGGING" \ | |
| --arg max_retries "$MAX_RETRIES" \ | |
| --arg tool_preset "$TOOL_PRESET" \ | |
| --arg agent_type "$AGENT_TYPE" \ | |
| --arg partial_archive_url "$PARTIAL_ARCHIVE_URL" \ | |
| --arg triggered_by "$TRIGGERED_BY" \ | |
| '{ref: $ref, inputs: {sdk_commit: $sdk, sdk_workflow_run_id: $sdk_run_id, eval_limit: $eval_limit, models_json: ($models | tostring), trigger_reason: $reason, pr_number: $pr, benchmarks_branch: $benchmarks, benchmark: $benchmark, instance_ids: $instance_ids, num_infer_workers: $num_infer_workers, num_eval_workers: $num_eval_workers, enable_conversation_event_logging: $enable_conversation_event_logging, max_retries: $max_retries, tool_preset: $tool_preset, agent_type: $agent_type, partial_archive_url: $partial_archive_url, triggered_by: $triggered_by}}') | |
| RESPONSE=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" -X POST \ | |
| -H "Authorization: token $PAT_TOKEN" \ | |
| -H "Accept: application/vnd.github+json" \ | |
| -d "$PAYLOAD" \ | |
| "https://api.github.com/repos/${EVAL_REPO}/actions/workflows/${EVAL_WORKFLOW}/dispatches") | |
| if [ "$RESPONSE" != "204" ]; then | |
| echo "Dispatch failed (status $RESPONSE):" >&2 | |
| cat /tmp/dispatch.out >&2 | |
| exit 1 | |
| fi | |
| - name: Comment on PR | |
| env: | |
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |
| SDK_SHA: ${{ steps.params.outputs.sdk_sha }} | |
| EVAL_LIMIT: ${{ steps.params.outputs.eval_limit }} | |
| MODELS: ${{ steps.params.outputs.models }} | |
| TRIGGER_DESC: ${{ steps.params.outputs.trigger_desc }} | |
| EVENT_NAME: ${{ github.event_name }} | |
| PR_NUMBER_INPUT: ${{ steps.params.outputs.pr_number }} | |
| run: | | |
| set -euo pipefail | |
| PR_NUMBER="$PR_NUMBER_INPUT" | |
| if [ "$EVENT_NAME" = "release" ] && [ -z "$PR_NUMBER" ]; then | |
| # Attempt to find the merged PR for this commit | |
| PR_NUMBER=$(curl -sS \ | |
| -H "Authorization: Bearer $GITHUB_TOKEN" \ | |
| -H "Accept: application/vnd.github+json" \ | |
| "https://api.github.com/repos/${{ github.repository }}/commits/${SDK_SHA}/pulls" \ | |
| | jq -r '.[0].number // ""') | |
| fi | |
| if [ -z "$PR_NUMBER" ]; then | |
| echo "No PR found to comment on; skipping comment" | |
| exit 0 | |
| fi | |
| COMMENT_BODY=$(printf '**Evaluation Triggered**\n\n- Trigger: %s\n- SDK: %s\n- Eval limit: %s\n- Models: %s\n' \ | |
| "$TRIGGER_DESC" "$SDK_SHA" "$EVAL_LIMIT" "$MODELS") | |
| curl -sS -X POST \ | |
| -H "Accept: application/vnd.github+json" \ | |
| -H "Authorization: Bearer $GITHUB_TOKEN" \ | |
| "https://api.github.com/repos/${{ github.repository }}/issues/${PR_NUMBER}/comments" \ | |
| -d "$(jq -n --arg body "$COMMENT_BODY" '{body: $body}')" |