# Workflow: Run Integration Tests (tracked as run-eval dependency #4072)
---
name: Run Integration Tests
# Human-readable run title: manual reason, triggering label name, or 'scheduled'
run-name: >-
  Run Integration Tests ${{ inputs.reason || github.event.label.name || 'scheduled' }}
on:
  # Use pull_request_target to access secrets even on fork PRs
  # This is safe because we only run when the 'integration-test' label is added by a maintainer
  pull_request_target:
    types:
      - labeled
  workflow_dispatch:
    inputs:
      reason:
        description: Reason for manual trigger
        required: true
        default: ''
      test_type:
        description: Select which tests to run (all, integration, behavior)
        required: false
        default: all
      model_ids:
        description: >-
          Comma-separated model IDs to test (from resolve_model_config.py).
          Example: claude-sonnet-4-6,glm-4.7. Defaults to a standard set.
        required: false
        default: ''
        type: string
      issue_number:
        description: Issue or PR number to post results to (optional)
        required: false
        default: ''
        type: string
      tool_preset:
        description: >-
          Tool preset for file editing (default, gemini, gpt5, planning).
          'default' uses FileEditorTool, 'gemini' uses read_file/write_file/edit/list_directory,
          'gpt5' uses apply_patch tool.
        required: false
        default: default
        type: choice
        options:
          - default
          - gemini
          - gpt5
          - planning
  schedule:
    # Quoted so the '*' characters can never be misread as YAML alias syntax
    - cron: '30 22 * * *'  # Runs at 10:30pm UTC every day
env:
  N_PROCESSES: 4  # Global configuration for number of parallel processes for evaluation
  # Default models for scheduled/label-triggered runs (subset of models from resolve_model_config.py)
  DEFAULT_MODEL_IDS: claude-sonnet-4-6,deepseek-v3.2-reasoner,kimi-k2-thinking,gemini-3.1-pro
jobs:
  # Resolves the model matrix and the target issue/PR number used by later jobs.
  setup-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.resolve-models.outputs.matrix }}
      issue_number: ${{ steps.resolve-issue.outputs.issue_number }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          # For pull_request_target: checkout the fork PR's code; otherwise current ref
          repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
          ref: ${{ github.event.pull_request.head.sha || github.ref }}
          # Security: never persist credentials when checking out untrusted PR code
          persist-credentials: false
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'
      - name: Resolve model configurations
        id: resolve-models
        env:
          MODEL_IDS_INPUT: ${{ github.event.inputs.model_ids || '' }}
          DEFAULT_MODEL_IDS: ${{ env.DEFAULT_MODEL_IDS }}
        run: |
          # Use input model_ids if provided, otherwise use defaults
          if [ -z "$MODEL_IDS_INPUT" ]; then
            MODEL_IDS="$DEFAULT_MODEL_IDS"
            echo "No model_ids specified, using defaults: $MODEL_IDS"
          else
            MODEL_IDS="$MODEL_IDS_INPUT"
            echo "Using specified model_ids: $MODEL_IDS"
          fi
          # Resolve model configs using resolve_model_config.py and transform the
          # output to the matrix format used by the integration-test job.
          # NOTE: 'if !' is required — the default step shell runs with errexit,
          # so a plain "$?" check after a failing command substitution would be
          # unreachable (the step dies before the check runs).
          if ! MATRIX=$(python3 << EOF
          import json
          import sys
          sys.path.insert(0, '.github/run-eval')
          from resolve_model_config import MODELS
          model_ids = "$MODEL_IDS".split(",")
          model_ids = [m.strip() for m in model_ids if m.strip()]
          matrix = []
          for model_id in model_ids:
              if model_id not in MODELS:
                  available = ", ".join(sorted(MODELS.keys()))
                  print(f"Error: Model ID '{model_id}' not found. Available: {available}", file=sys.stderr)
                  sys.exit(1)
              model = MODELS[model_id]
              # Create run-suffix from model id (replace special chars with underscore)
              run_suffix = model_id.replace("-", "_").replace(".", "_") + "_run"
              matrix.append({
                  "id": model_id,
                  "name": model["display_name"],
                  "run-suffix": run_suffix,
                  "llm-config": model["llm_config"]
              })
          print(json.dumps(matrix))
          EOF
          ); then
            echo "Failed to resolve model configurations" >&2
            exit 1
          fi
          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
          echo "Resolved models: $(echo "$MATRIX" | jq -r '.[].name' | paste -sd', ' -)"
      - name: Resolve issue number
        id: resolve-issue
        env:
          ISSUE_NUMBER_INPUT: ${{ github.event.inputs.issue_number || '' }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          # Priority: explicit input > PR number from label trigger
          if [ -n "$ISSUE_NUMBER_INPUT" ]; then
            echo "issue_number=$ISSUE_NUMBER_INPUT" >> "$GITHUB_OUTPUT"
          elif [ -n "$PR_NUMBER" ]; then
            echo "issue_number=$PR_NUMBER" >> "$GITHUB_OUTPUT"
          else
            echo "issue_number=" >> "$GITHUB_OUTPUT"
          fi
# Post initial comment for label triggers (no dependencies - runs immediately)
post-label-comment:
if: >
github.event_name == 'pull_request_target' && (
github.event.label.name == 'integration-test' ||
github.event.label.name == 'behavior-test'
)
runs-on: ubuntu-latest
permissions:
pull-requests: write
steps:
- name: Comment on PR (integration tests via label)
if: github.event.label.name == 'integration-test'
uses: KeisukeYamashita/create-comment@v1
with:
unique: false
comment: |
Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
- name: Comment on PR (behavior tests via label)
if: github.event.label.name == 'behavior-test'
uses: KeisukeYamashita/create-comment@v1
with:
unique: false
comment: |
Hi! I started running the behavior tests on your PR. You will receive a comment with the results shortly.
# Post initial comment for workflow_dispatch (depends on setup-matrix for issue_number resolution)
post-dispatch-comment:
needs: setup-matrix
if: github.event_name == 'workflow_dispatch' && github.event.inputs.issue_number != ''
runs-on: ubuntu-latest
permissions:
issues: write
steps:
- name: Comment on issue/PR (workflow_dispatch)
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
ISSUE_NUMBER: ${{ github.event.inputs.issue_number }}
MODEL_IDS: ${{ github.event.inputs.model_ids || 'all models' }}
TEST_TYPE: ${{ github.event.inputs.test_type || 'all' }}
REASON: ${{ github.event.inputs.reason }}
run: |
# Sanitize @OpenHands mentions to prevent self-mention loops
SANITIZED_REASON=$(echo "$REASON" | sed 's/@OpenHands/@\u200BOpenHands/g; s/@openhands/@\u200Bopenhands/g')
SANITIZED_MODEL_IDS=$(echo "$MODEL_IDS" | sed 's/@OpenHands/@\u200BOpenHands/g; s/@openhands/@\u200Bopenhands/g')
COMMENT_BODY=$(cat <<EOF
**Integration Tests Triggered**
- **Reason:** $SANITIZED_REASON
- **Test type:** $TEST_TYPE
- **Models:** $SANITIZED_MODEL_IDS
- **Workflow run:** ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
Results will be posted here when complete.
EOF
)
gh issue comment "$ISSUE_NUMBER" --body "$COMMENT_BODY"
run-integration-tests:
# Security: Only run when integration-related labels are present, via workflow_dispatch, or on schedule
# This prevents automatic execution on fork PRs without maintainer approval
# Note: uses always() to run even when comment jobs are skipped (e.g., for scheduled runs)
# Schedule trigger only runs in the main repository, not in forks
if: |
always() && (
(
github.event_name == 'pull_request_target' && (
github.event.label.name == 'integration-test' ||
github.event.label.name == 'behavior-test'
)
) ||
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk')
) && needs.setup-matrix.result == 'success'
needs: [setup-matrix, post-label-comment, post-dispatch-comment]
runs-on: ubuntu-22.04
timeout-minutes: 180
permissions:
contents: read
id-token: write
pull-requests: write
issues: write
strategy:
fail-fast: false
matrix:
python-version: ['3.13']
job-config: ${{ fromJson(needs.setup-matrix.outputs.matrix) }}
steps:
- name: Checkout repository
uses: actions/checkout@v6
with:
# For pull_request_target: checkout fork PR code (requires explicit repository)
# For other events: fallback to current repository and ref
repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
ref: ${{ github.event.pull_request.head.sha || github.ref }}
# Security: Don't persist credentials to prevent untrusted PR code from using them
persist-credentials: false
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
version: latest
python-version: ${{ matrix.python-version }}
- name: Install Python dependencies using uv
run: |
uv sync --dev
uv pip install pytest
# Run integration test evaluation
- name: Determine test selection
run: |
TEST_TYPE_ARGS=""
if [ "${{ github.event_name }}" = "pull_request_target" ] && [ "${{ github.event.label.name }}" = "behavior-test" ]; then
TEST_TYPE_ARGS="--test-type behavior"
echo "behavior-test label detected; running behavior tests only."
elif [ "${{ github.event_name }}" = "pull_request_target" ] && [ "${{ github.event.label.name }}" = "integration-test" ]; then
TEST_TYPE_ARGS="--test-type integration"
echo "integration-test label detected; running integration tests only."
elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
test_type="${{ github.event.inputs.test_type }}"
case "$test_type" in
behavior)
TEST_TYPE_ARGS="--test-type behavior"
echo "workflow_dispatch requested behavior tests only."
;;
integration)
TEST_TYPE_ARGS="--test-type integration"
echo "workflow_dispatch requested integration tests only."
;;
""|all)
echo "workflow_dispatch requested full integration suite."
;;
*)
echo "workflow_dispatch provided unknown test_type '$test_type'; defaulting to full suite."
;;
esac
elif [ "${{ github.event_name }}" = "schedule" ]; then
TEST_TYPE_ARGS="--test-type integration"
echo "Scheduled run; running integration tests only."
else
echo "Running full integration test suite."
fi
echo "TEST_TYPE_ARGS=$TEST_TYPE_ARGS" >> "$GITHUB_ENV"
- name: Run integration test evaluation for ${{ matrix.job-config['name'] }}
env:
LLM_CONFIG: ${{ toJson(matrix.job-config['llm-config']) }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY_EVAL }}
LLM_BASE_URL: https://llm-proxy.eval.all-hands.dev
TOOL_PRESET: ${{ github.event.inputs.tool_preset || 'default' }}
run: |
set -eo pipefail
AGENT_SDK_VERSION=$(git rev-parse --short HEAD)
EVAL_NOTE="${AGENT_SDK_VERSION}_${{ matrix.job-config['run-suffix'] }}"
echo "Invoking test runner with TEST_TYPE_ARGS='$TEST_TYPE_ARGS' TOOL_PRESET='$TOOL_PRESET'"
uv run python tests/integration/run_infer.py \
--llm-config "$LLM_CONFIG" \
--num-workers $N_PROCESSES \
--eval-note "$EVAL_NOTE" \
--tool-preset "$TOOL_PRESET" \
$TEST_TYPE_ARGS
# get integration tests JSON results
RESULTS_FILE=$(find tests/integration/outputs/*${{ matrix.job-config['run-suffix'] }}* -name "results.json" -type f | head -n 1)
echo "RESULTS_FILE: $RESULTS_FILE"
if [ -f "$RESULTS_FILE" ]; then
echo "JSON_RESULTS_FILE=$RESULTS_FILE" >> $GITHUB_ENV
else
echo "JSON_RESULTS_FILE=" >> $GITHUB_ENV
fi
- name: Wait a little bit
run: sleep 10
- name: Create archive of evaluation outputs
run: |
TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
cd tests/integration/outputs # Change to the outputs directory
tar -czvf ../../../integration_tests_${{ matrix.job-config['run-suffix'] }}_${TIMESTAMP}.tar.gz *${{ matrix.job-config['run-suffix'] }}* # Include result directories for this model
- name: Upload evaluation results as artifact
uses: actions/upload-artifact@v7
id: upload_results_artifact
with:
name: integration-test-outputs-${{ matrix.job-config['run-suffix'] }}-${{ github.run_id }}-${{ github.run_attempt }}
path: integration_tests_${{ matrix.job-config['run-suffix'] }}_*.tar.gz
- name: Save test results for consolidation
run: |
# Copy the structured JSON results file for consolidation
mkdir -p test_results_summary
if [ -n "${{ env.JSON_RESULTS_FILE }}" ] && [ -f "${{ env.JSON_RESULTS_FILE }}" ]; then
# Copy the JSON results file directly
cp "${{ env.JSON_RESULTS_FILE }}" "test_results_summary/${{ matrix.job-config['run-suffix'] }}_results.json"
echo "✓ Copied JSON results file for consolidation"
else
echo "✗ No JSON results file found"
exit 1
fi
- name: Upload test results summary
uses: actions/upload-artifact@v7
with:
name: test-results-${{ matrix.job-config['run-suffix'] }}
path: test_results_summary/${{ matrix.job-config['run-suffix'] }}_results.json
consolidate-results:
needs: [setup-matrix, run-integration-tests]
if: |
always() && (
(
github.event_name == 'pull_request_target' && (
github.event.label.name == 'integration-test' ||
github.event.label.name == 'behavior-test'
)
) ||
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk')
)
runs-on: ubuntu-24.04
permissions:
contents: read
pull-requests: write
issues: write
steps:
- name: Checkout repository
uses: actions/checkout@v6
with:
# When using pull_request_target, explicitly checkout the PR branch
# This ensures we use the scripts from the actual PR code
ref: ${{ github.event.pull_request.head.sha || github.ref }}
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
version: latest
python-version: '3.13'
- name: Install Python dependencies using uv
run: |
uv sync --dev
- name: Download all test results
uses: actions/download-artifact@v8
with:
pattern: test-results-*
merge-multiple: true
path: all_results
- name: Download all integration test artifacts
uses: actions/download-artifact@v8
with:
pattern: integration-test-outputs-*
path: artifacts
- name: Consolidate test results
env:
EVENT_NAME: ${{ github.event_name }}
PR_NUMBER: ${{ github.event.pull_request.number }}
MANUAL_REASON: ${{ github.event.inputs.reason }}
COMMIT_SHA: ${{ github.sha }}
PYTHONPATH: ${{ github.workspace }}
GITHUB_SERVER_URL: ${{ github.server_url }}
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_RUN_ID: ${{ github.run_id }}
run: |
uv run python tests/integration/utils/consolidate_json_results.py \
--results-dir all_results \
--artifacts-dir artifacts \
--output-file consolidated_results.json
echo "Consolidated results generated successfully"
uv run python tests/integration/utils/generate_markdown_report.py \
--input-file consolidated_results.json \
--output-file consolidated_report.md
- name: Upload consolidated report
uses: actions/upload-artifact@v7
with:
name: consolidated-report
path: consolidated_report.md
- name: Create consolidated PR comment
if: github.event_name == 'pull_request_target'
run: |
# Sanitize @OpenHands mentions to prevent self-mention loops
COMMENT_BODY=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < consolidated_report.md)
# Use GitHub CLI to create comment with explicit PR number
echo "$COMMENT_BODY" | gh pr comment ${{ github.event.pull_request.number }} --body-file -
env:
GH_TOKEN: ${{ github.token }}
- name: Comment on specified issue/PR (workflow_dispatch)
if: github.event_name == 'workflow_dispatch' && needs.setup-matrix.outputs.issue_number != ''
env:
GH_TOKEN: ${{ github.token }}
ISSUE_NUMBER: ${{ needs.setup-matrix.outputs.issue_number }}
run: |
# Sanitize @OpenHands mentions to prevent self-mention loops
COMMENT_BODY=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < consolidated_report.md)
# Use GitHub CLI to create comment on the specified issue/PR
echo "$COMMENT_BODY" | gh issue comment "$ISSUE_NUMBER" --body-file -
- name: Read consolidated report for tracker issue
if: github.event_name == 'schedule'
id: read_report
run: |
# Read and sanitize the report, then set as output
REPORT_CONTENT=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < consolidated_report.md)
echo "report<<EOF" >> $GITHUB_OUTPUT
echo "$REPORT_CONTENT" >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT
- name: Comment with results on tracker issue
if: github.event_name == 'schedule'
uses: KeisukeYamashita/create-comment@v1
with:
number: 2078
unique: false
comment: |
**Trigger:** Nightly Scheduled Run
**Commit:** ${{ github.sha }}
${{ steps.read_report.outputs.report }}