# Workflow: Run Integration Tests (tracked as run-eval dependency #4072)
---
name: Run Integration Tests
# Human-readable run title: manual reason, triggering label name, or 'scheduled'
run-name: >-
  Run Integration Tests ${{ inputs.reason || github.event.label.name || 'scheduled' }}
on:
  # Use pull_request_target to access secrets even on fork PRs
  # This is safe because we only run when the 'integration-test' label is added by a maintainer
  pull_request_target:
    types:
      - labeled
  workflow_dispatch:
    inputs:
      reason:
        description: Reason for manual trigger
        required: true
        default: ''
      test_type:
        description: Select which tests to run (all, integration, behavior)
        required: false
        default: all
      model_ids:
        description: >-
          Comma-separated model IDs to test (from resolve_model_config.py).
          Example: claude-sonnet-4-6,glm-4.7. Defaults to a standard set.
        required: false
        default: ''
        type: string
      issue_number:
        description: Issue or PR number to post results to (optional)
        required: false
        default: ''
        type: string
      tool_preset:
        description: >-
          Tool preset for file editing (default, gemini, gpt5, planning).
          'default' uses FileEditorTool, 'gemini' uses read_file/write_file/edit/list_directory,
          'gpt5' uses apply_patch tool.
        required: false
        default: default
        type: choice
        options:
          - default
          - gemini
          - gpt5
          - planning
  schedule:
    # Quoted so the '*' characters can never be misread as YAML alias syntax
    - cron: '30 22 * * *'  # Runs at 10:30pm UTC every day
env:
  N_PROCESSES: 4  # Global configuration for number of parallel processes for evaluation
  # Default models for scheduled/label-triggered runs (subset of models from resolve_model_config.py)
  DEFAULT_MODEL_IDS: claude-sonnet-4-6,deepseek-v3.2-reasoner,kimi-k2-thinking,gemini-3.1-pro
jobs:
  # Resolves the model matrix and the target issue/PR number used by later jobs.
  setup-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.resolve-models.outputs.matrix }}
      issue_number: ${{ steps.resolve-issue.outputs.issue_number }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          # For pull_request_target: checkout the fork PR's code; otherwise current ref
          repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
          ref: ${{ github.event.pull_request.head.sha || github.ref }}
          # Security: never persist credentials when checking out untrusted PR code
          persist-credentials: false
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'
      - name: Resolve model configurations
        id: resolve-models
        env:
          MODEL_IDS_INPUT: ${{ github.event.inputs.model_ids || '' }}
          DEFAULT_MODEL_IDS: ${{ env.DEFAULT_MODEL_IDS }}
        run: |
          # Use input model_ids if provided, otherwise use defaults
          if [ -z "$MODEL_IDS_INPUT" ]; then
            MODEL_IDS="$DEFAULT_MODEL_IDS"
            echo "No model_ids specified, using defaults: $MODEL_IDS"
          else
            MODEL_IDS="$MODEL_IDS_INPUT"
            echo "Using specified model_ids: $MODEL_IDS"
          fi
          # Resolve model configs using resolve_model_config.py and transform the
          # output to the matrix format used by the integration-test job.
          # NOTE: 'if !' is required — the default step shell runs with errexit,
          # so a plain "$?" check after a failing command substitution would be
          # unreachable (the step dies before the check runs).
          if ! MATRIX=$(python3 << EOF
          import json
          import sys
          sys.path.insert(0, '.github/run-eval')
          from resolve_model_config import MODELS
          model_ids = "$MODEL_IDS".split(",")
          model_ids = [m.strip() for m in model_ids if m.strip()]
          matrix = []
          for model_id in model_ids:
              if model_id not in MODELS:
                  available = ", ".join(sorted(MODELS.keys()))
                  print(f"Error: Model ID '{model_id}' not found. Available: {available}", file=sys.stderr)
                  sys.exit(1)
              model = MODELS[model_id]
              # Create run-suffix from model id (replace special chars with underscore)
              run_suffix = model_id.replace("-", "_").replace(".", "_") + "_run"
              matrix.append({
                  "id": model_id,
                  "name": model["display_name"],
                  "run-suffix": run_suffix,
                  "llm-config": model["llm_config"]
              })
          print(json.dumps(matrix))
          EOF
          ); then
            echo "Failed to resolve model configurations" >&2
            exit 1
          fi
          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
          echo "Resolved models: $(echo "$MATRIX" | jq -r '.[].name' | paste -sd', ' -)"
      - name: Resolve issue number
        id: resolve-issue
        env:
          ISSUE_NUMBER_INPUT: ${{ github.event.inputs.issue_number || '' }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          # Priority: explicit input > PR number from label trigger
          if [ -n "$ISSUE_NUMBER_INPUT" ]; then
            echo "issue_number=$ISSUE_NUMBER_INPUT" >> "$GITHUB_OUTPUT"
          elif [ -n "$PR_NUMBER" ]; then
            echo "issue_number=$PR_NUMBER" >> "$GITHUB_OUTPUT"
          else
            echo "issue_number=" >> "$GITHUB_OUTPUT"
          fi
# Post initial comment for label triggers (no dependencies - runs immediately)
post-label-comment:
if: >
github.event_name == 'pull_request_target' && (
github.event.label.name == 'integration-test' ||
github.event.label.name == 'behavior-test'
)
runs-on: ubuntu-latest
permissions:
pull-requests: write
steps:
- name: Comment on PR (integration tests via label)
if: github.event.label.name == 'integration-test'
uses: KeisukeYamashita/create-comment@v1
with:
unique: false
comment: |
Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
- name: Comment on PR (behavior tests via label)
if: github.event.label.name == 'behavior-test'
uses: KeisukeYamashita/create-comment@v1
with:
unique: false
comment: |
Hi! I started running the behavior tests on your PR. You will receive a comment with the results shortly.
# Post initial comment for workflow_dispatch (depends on setup-matrix for issue_number resolution)
post-dispatch-comment:
needs: setup-matrix
if: github.event_name == 'workflow_dispatch' && github.event.inputs.issue_number != ''
runs-on: ubuntu-latest
permissions:
issues: write
steps:
- name: Comment on issue/PR (workflow_dispatch)
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
ISSUE_NUMBER: ${{ github.event.inputs.issue_number }}
MODEL_IDS: ${{ github.event.inputs.model_ids || 'all models' }}
TEST_TYPE: ${{ github.event.inputs.test_type || 'all' }}
REASON: ${{ github.event.inputs.reason }}
run: |
# Sanitize @OpenHands mentions to prevent self-mention loops
SANITIZED_REASON=$(echo "$REASON" | sed 's/@OpenHands/@\u200BOpenHands/g; s/@openhands/@\u200Bopenhands/g')
SANITIZED_MODEL_IDS=$(echo "$MODEL_IDS" | sed 's/@OpenHands/@\u200BOpenHands/g; s/@openhands/@\u200Bopenhands/g')
COMMENT_BODY=$(cat <<EOF
**Integration Tests Triggered**
- **Reason:** $SANITIZED_REASON
- **Test type:** $TEST_TYPE
- **Models:** $SANITIZED_MODEL_IDS
- **Workflow run:** ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
Results will be posted here when complete.
EOF
)
gh issue comment "$ISSUE_NUMBER" --body "$COMMENT_BODY"
run-integration-tests:
# Security: Only run when integration-related labels are present, via workflow_dispatch, or on schedule
# This prevents automatic execution on fork PRs without maintainer approval
# Note: uses always() to run even when comment jobs are skipped (e.g., for scheduled runs)
# Schedule trigger only runs in the main repository, not in forks
if: |
always() && (
(
github.event_name == 'pull_request_target' && (
github.event.label.name == 'integration-test' ||
github.event.label.name == 'behavior-test'
)
) ||
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk')
) && needs.setup-matrix.result == 'success'
needs: [setup-matrix, post-label-comment, post-dispatch-comment]
runs-on: ubuntu-22.04
timeout-minutes: 180
permissions:
contents: read
id-token: write
pull-requests: write
issues: write
strategy:
fail-fast: false
matrix:
python-version: ['3.13']
job-config: ${{ fromJson(needs.setup-matrix.outputs.matrix) }}
steps:
- name: Checkout repository
uses: actions/checkout@v6
with:
# For pull_request_target: checkout fork PR code (requires explicit repository)
# For other events: fallback to current repository and ref
repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
ref: ${{ github.event.pull_request.head.sha || github.ref }}
# Security: Don't persist credentials to prevent untrusted PR code from using them
persist-credentials: false
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
version: latest
python-version: ${{ matrix.python-version }}
- name: Install Python dependencies using uv
run: |
uv sync --dev
uv pip install pytest
# Run integration test evaluation
- name: Determine test selection
run: |
TEST_TYPE_ARGS=""
if [ "${{ github.event_name }}" = "pull_request_target" ] && [ "${{ github.event.label.name }}" = "behavior-test" ]; then
TEST_TYPE_ARGS="--test-type behavior"
echo "behavior-test label detected; running behavior tests only."
elif [ "${{ github.event_name }}" = "pull_request_target" ] && [ "${{ github.event.label.name }}" = "integration-test" ]; then
TEST_TYPE_ARGS="--test-type integration"
echo "integration-test label detected; running integration tests only."
elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
test_type="${{ github.event.inputs.test_type }}"
case "$test_type" in
behavior)
TEST_TYPE_ARGS="--test-type behavior"
echo "workflow_dispatch requested behavior tests only."
;;
integration)
TEST_TYPE_ARGS="--test-type integration"
echo "workflow_dispatch requested integration tests only."
;;
""|all)
echo "workflow_dispatch requested full integration suite."
;;
*)
echo "workflow_dispatch provided unknown test_type '$test_type'; defaulting to full suite."
;;
esac
elif [ "${{ github.event_name }}" = "schedule" ]; then
TEST_TYPE_ARGS="--test-type integration"
echo "Scheduled run; running integration tests only."
else
echo "Running full integration test suite."
fi
echo "TEST_TYPE_ARGS=$TEST_TYPE_ARGS" >> "$GITHUB_ENV"
- name: Run integration test evaluation for ${{ matrix.job-config['name'] }}
env:
LLM_CONFIG: ${{ toJson(matrix.job-config['llm-config']) }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY_EVAL }}
LLM_BASE_URL: https://llm-proxy.eval.all-hands.dev
TOOL_PRESET: ${{ github.event.inputs.tool_preset || 'default' }}
run: |
set -eo pipefail
AGENT_SDK_VERSION=$(git rev-parse --short HEAD)
EVAL_NOTE="${AGENT_SDK_VERSION}_${{ matrix.job-config['run-suffix'] }}"
echo "Invoking test runner with TEST_TYPE_ARGS='$TEST_TYPE_ARGS' TOOL_PRESET='$TOOL_PRESET'"
uv run python tests/integration/run_infer.py \
--llm-config "$LLM_CONFIG" \
--num-workers $N_PROCESSES \
--eval-note "$EVAL_NOTE" \
--tool-preset "$TOOL_PRESET" \
$TEST_TYPE_ARGS
# get integration tests JSON results
RESULTS_FILE=$(find tests/integration/outputs/*${{ matrix.job-config['run-suffix'] }}* -name "results.json" -type f | head -n 1)
echo "RESULTS_FILE: $RESULTS_FILE"
if [ -f "$RESULTS_FILE" ]; then
echo "JSON_RESULTS_FILE=$RESULTS_FILE" >> $GITHUB_ENV
else
echo "JSON_RESULTS_FILE=" >> $GITHUB_ENV
fi
- name: Wait a little bit
run: sleep 10
- name: Create archive of evaluation outputs
run: |
TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
cd tests/integration/outputs # Change to the outputs directory
tar -czvf ../../../integration_tests_${{ matrix.job-config['run-suffix'] }}_${TIMESTAMP}.tar.gz *${{ matrix.job-config['run-suffix'] }}* # Include result directories for this model
- name: Upload evaluation results as artifact
uses: actions/upload-artifact@v7
id: upload_results_artifact
with:
name: integration-test-outputs-${{ matrix.job-config['run-suffix'] }}-${{ github.run_id }}-${{ github.run_attempt }}
path: integration_tests_${{ matrix.job-config['run-suffix'] }}_*.tar.gz
- name: Save test results for consolidation
run: |
# Copy the structured JSON results file for consolidation
mkdir -p test_results_summary
if [ -n "${{ env.JSON_RESULTS_FILE }}" ] && [ -f "${{ env.JSON_RESULTS_FILE }}" ]; then
# Copy the JSON results file directly
cp "${{ env.JSON_RESULTS_FILE }}" "test_results_summary/${{ matrix.job-config['run-suffix'] }}_results.json"
echo "✓ Copied JSON results file for consolidation"
else
echo "✗ No JSON results file found"
exit 1
fi
- name: Upload test results summary
uses: actions/upload-artifact@v7
with:
name: test-results-${{ matrix.job-config['run-suffix'] }}
path: test_results_summary/${{ matrix.job-config['run-suffix'] }}_results.json
consolidate-results:
needs: [setup-matrix, run-integration-tests]
if: |
always() && (
(
github.event_name == 'pull_request_target' && (
github.event.label.name == 'integration-test' ||
github.event.label.name == 'behavior-test'
)
) ||
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk')
)
runs-on: ubuntu-24.04
permissions:
contents: read
pull-requests: write
issues: write
steps:
- name: Checkout repository
uses: actions/checkout@v6
with:
# When using pull_request_target, explicitly checkout the PR branch
# This ensures we use the scripts from the actual PR code
ref: ${{ github.event.pull_request.head.sha || github.ref }}
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
version: latest
python-version: '3.13'
- name: Install Python dependencies using uv
run: |
uv sync --dev
- name: Download all test results
uses: actions/download-artifact@v8
with:
pattern: test-results-*
merge-multiple: true
path: all_results
- name: Download all integration test artifacts
uses: actions/download-artifact@v8
with:
pattern: integration-test-outputs-*
path: artifacts
- name: Consolidate test results
env:
EVENT_NAME: ${{ github.event_name }}
PR_NUMBER: ${{ github.event.pull_request.number }}
MANUAL_REASON: ${{ github.event.inputs.reason }}
COMMIT_SHA: ${{ github.sha }}
PYTHONPATH: ${{ github.workspace }}
GITHUB_SERVER_URL: ${{ github.server_url }}
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_RUN_ID: ${{ github.run_id }}
run: |
uv run python tests/integration/utils/consolidate_json_results.py \
--results-dir all_results \
--artifacts-dir artifacts \
--output-file consolidated_results.json
echo "Consolidated results generated successfully"
uv run python tests/integration/utils/generate_markdown_report.py \
--input-file consolidated_results.json \
--output-file consolidated_report.md
- name: Upload consolidated report
uses: actions/upload-artifact@v7
with:
name: consolidated-report
path: consolidated_report.md
- name: Create consolidated PR comment
if: github.event_name == 'pull_request_target'
run: |
# Sanitize @OpenHands mentions to prevent self-mention loops
COMMENT_BODY=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < consolidated_report.md)
# Use GitHub CLI to create comment with explicit PR number
echo "$COMMENT_BODY" | gh pr comment ${{ github.event.pull_request.number }} --body-file -
env:
GH_TOKEN: ${{ github.token }}
- name: Comment on specified issue/PR (workflow_dispatch)
if: github.event_name == 'workflow_dispatch' && needs.setup-matrix.outputs.issue_number != ''
env:
GH_TOKEN: ${{ github.token }}
ISSUE_NUMBER: ${{ needs.setup-matrix.outputs.issue_number }}
run: |
# Sanitize @OpenHands mentions to prevent self-mention loops
COMMENT_BODY=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < consolidated_report.md)
# Use GitHub CLI to create comment on the specified issue/PR
echo "$COMMENT_BODY" | gh issue comment "$ISSUE_NUMBER" --body-file -
- name: Read consolidated report for tracker issue
if: github.event_name == 'schedule'
id: read_report
run: |
# Read and sanitize the report, then set as output
REPORT_CONTENT=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < consolidated_report.md)
echo "report<<EOF" >> $GITHUB_OUTPUT
echo "$REPORT_CONTENT" >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT
- name: Comment with results on tracker issue
if: github.event_name == 'schedule'
uses: KeisukeYamashita/create-comment@v1
with:
number: 2078
unique: false
comment: |
**Trigger:** Nightly Scheduled Run
**Commit:** ${{ github.sha }}
${{ steps.read_report.outputs.report }}