# Run Integration Tests dependencies #4072
# Workflow file for this run.
# Note: this file may contain hidden or bidirectional Unicode text that can be
# interpreted differently than it appears; review in an editor that reveals it.
---
name: Run Integration Tests
# Human-readable run name: manual reason, triggering label, or 'scheduled'.
run-name: >-
  Run Integration Tests ${{ inputs.reason || github.event.label.name || 'scheduled' }}

on:
  # Use pull_request_target to access secrets even on fork PRs
  # This is safe because we only run when the 'integration-test' label is added by a maintainer
  pull_request_target:
    types:
      - labeled
  workflow_dispatch:
    inputs:
      reason:
        description: Reason for manual trigger
        required: true
        default: ''
      test_type:
        description: Select which tests to run (all, integration, behavior)
        required: false
        default: all
      model_ids:
        description: >-
          Comma-separated model IDs to test (from resolve_model_config.py).
          Example: claude-sonnet-4-6,glm-4.7. Defaults to a standard set.
        required: false
        default: ''
        type: string
      issue_number:
        description: Issue or PR number to post results to (optional)
        required: false
        default: ''
        type: string
      tool_preset:
        description: >-
          Tool preset for file editing (default, gemini, gpt5, planning).
          'default' uses FileEditorTool, 'gemini' uses read_file/write_file/edit/list_directory,
          'gpt5' uses apply_patch tool.
        required: false
        default: default
        type: choice
        options:
          - default
          - gemini
          - gpt5
          - planning
  schedule:
    # Quoted so the cron string is always parsed as a plain string, never
    # subject to YAML implicit typing or alias/indicator edge cases.
    - cron: '30 22 * * *'  # Runs at 10:30pm UTC every day

env:
  N_PROCESSES: 4  # Global configuration for number of parallel processes for evaluation
  # Default models for scheduled/label-triggered runs (subset of models from resolve_model_config.py)
  DEFAULT_MODEL_IDS: claude-sonnet-4-6,deepseek-v3.2-reasoner,kimi-k2-thinking,gemini-3.1-pro
jobs:
  # Resolves the model matrix and the target issue/PR number for result posting.
  setup-matrix:
    runs-on: ubuntu-latest
    outputs:
      # JSON list of {id, name, run-suffix, llm-config} consumed by run-integration-tests.
      matrix: ${{ steps.resolve-models.outputs.matrix }}
      # Issue/PR number (possibly empty) that result comments are posted to.
      issue_number: ${{ steps.resolve-issue.outputs.issue_number }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
          ref: ${{ github.event.pull_request.head.sha || github.ref }}
          persist-credentials: false
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'
      - name: Resolve model configurations
        id: resolve-models
        env:
          # workflow_dispatch input; empty for label/schedule triggers.
          # DEFAULT_MODEL_IDS is inherited from the workflow-level env block.
          MODEL_IDS_INPUT: ${{ github.event.inputs.model_ids || '' }}
        run: |
          # Use input model_ids if provided, otherwise use defaults
          if [ -z "$MODEL_IDS_INPUT" ]; then
            MODEL_IDS="$DEFAULT_MODEL_IDS"
            echo "No model_ids specified, using defaults: $MODEL_IDS"
          else
            MODEL_IDS="$MODEL_IDS_INPUT"
            echo "Using specified model_ids: $MODEL_IDS"
          fi
          export MODEL_IDS
          # Resolve model configs using resolve_model_config.py and transform
          # them to the matrix format for integration tests.
          # The heredoc delimiter is quoted so the Python source stays static
          # text, and the model list is read from the environment rather than
          # interpolated into code (prevents shell -> Python injection via the
          # workflow_dispatch input).
          # 'if !' is required: under GitHub's default 'bash -e' shell a bare
          # failing assignment would abort before any $? check could run.
          if ! MATRIX=$(python3 << 'EOF'
          import json
          import os
          import sys
          sys.path.insert(0, '.github/run-eval')
          from resolve_model_config import MODELS
          model_ids = [m.strip() for m in os.environ["MODEL_IDS"].split(",") if m.strip()]
          matrix = []
          for model_id in model_ids:
              if model_id not in MODELS:
                  available = ", ".join(sorted(MODELS.keys()))
                  print(f"Error: Model ID '{model_id}' not found. Available: {available}", file=sys.stderr)
                  sys.exit(1)
              model = MODELS[model_id]
              # Create run-suffix from model id (replace special chars with underscore)
              run_suffix = model_id.replace("-", "_").replace(".", "_") + "_run"
              matrix.append({
                  "id": model_id,
                  "name": model["display_name"],
                  "run-suffix": run_suffix,
                  "llm-config": model["llm_config"],
              })
          print(json.dumps(matrix))
          EOF
          ); then
            echo "Failed to resolve model configurations" >&2
            exit 1
          fi
          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
          echo "Resolved models: $(echo "$MATRIX" | jq -r '.[].name' | paste -sd', ' -)"
      - name: Resolve issue number
        id: resolve-issue
        env:
          ISSUE_NUMBER_INPUT: ${{ github.event.inputs.issue_number || '' }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          # Priority: explicit input > PR number from label trigger
          if [ -n "$ISSUE_NUMBER_INPUT" ]; then
            echo "issue_number=$ISSUE_NUMBER_INPUT" >> "$GITHUB_OUTPUT"
          elif [ -n "$PR_NUMBER" ]; then
            echo "issue_number=$PR_NUMBER" >> "$GITHUB_OUTPUT"
          else
            echo "issue_number=" >> "$GITHUB_OUTPUT"
          fi
  # Post initial comment for label triggers (no dependencies - runs immediately)
  post-label-comment:
    # Only react to the two recognized test labels on pull_request_target events.
    if: >
      github.event_name == 'pull_request_target' && (
        github.event.label.name == 'integration-test' ||
        github.event.label.name == 'behavior-test'
      )
    runs-on: ubuntu-latest
    permissions:
      pull-requests: write  # needed to comment on the PR
    steps:
      - name: Comment on PR (integration tests via label)
        if: github.event.label.name == 'integration-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false  # always post a fresh comment instead of updating a prior one
          comment: |
            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
      - name: Comment on PR (behavior tests via label)
        if: github.event.label.name == 'behavior-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the behavior tests on your PR. You will receive a comment with the results shortly.
| # Post initial comment for workflow_dispatch (depends on setup-matrix for issue_number resolution) | |
| post-dispatch-comment: | |
| needs: setup-matrix | |
| if: github.event_name == 'workflow_dispatch' && github.event.inputs.issue_number != '' | |
| runs-on: ubuntu-latest | |
| permissions: | |
| issues: write | |
| steps: | |
| - name: Comment on issue/PR (workflow_dispatch) | |
| env: | |
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |
| ISSUE_NUMBER: ${{ github.event.inputs.issue_number }} | |
| MODEL_IDS: ${{ github.event.inputs.model_ids || 'all models' }} | |
| TEST_TYPE: ${{ github.event.inputs.test_type || 'all' }} | |
| REASON: ${{ github.event.inputs.reason }} | |
| run: | | |
| # Sanitize @OpenHands mentions to prevent self-mention loops | |
| SANITIZED_REASON=$(echo "$REASON" | sed 's/@OpenHands/@\u200BOpenHands/g; s/@openhands/@\u200Bopenhands/g') | |
| SANITIZED_MODEL_IDS=$(echo "$MODEL_IDS" | sed 's/@OpenHands/@\u200BOpenHands/g; s/@openhands/@\u200Bopenhands/g') | |
| COMMENT_BODY=$(cat <<EOF | |
| **Integration Tests Triggered** | |
| - **Reason:** $SANITIZED_REASON | |
| - **Test type:** $TEST_TYPE | |
| - **Models:** $SANITIZED_MODEL_IDS | |
| - **Workflow run:** ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} | |
| Results will be posted here when complete. | |
| EOF | |
| ) | |
| gh issue comment "$ISSUE_NUMBER" --body "$COMMENT_BODY" | |
| run-integration-tests: | |
| # Security: Only run when integration-related labels are present, via workflow_dispatch, or on schedule | |
| # This prevents automatic execution on fork PRs without maintainer approval | |
| # Note: uses always() to run even when comment jobs are skipped (e.g., for scheduled runs) | |
| # Schedule trigger only runs in the main repository, not in forks | |
| if: | | |
| always() && ( | |
| ( | |
| github.event_name == 'pull_request_target' && ( | |
| github.event.label.name == 'integration-test' || | |
| github.event.label.name == 'behavior-test' | |
| ) | |
| ) || | |
| github.event_name == 'workflow_dispatch' || | |
| (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') | |
| ) && needs.setup-matrix.result == 'success' | |
| needs: [setup-matrix, post-label-comment, post-dispatch-comment] | |
| runs-on: ubuntu-22.04 | |
| timeout-minutes: 180 | |
| permissions: | |
| contents: read | |
| id-token: write | |
| pull-requests: write | |
| issues: write | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| python-version: ['3.13'] | |
| job-config: ${{ fromJson(needs.setup-matrix.outputs.matrix) }} | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v6 | |
| with: | |
| # For pull_request_target: checkout fork PR code (requires explicit repository) | |
| # For other events: fallback to current repository and ref | |
| repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }} | |
| ref: ${{ github.event.pull_request.head.sha || github.ref }} | |
| # Security: Don't persist credentials to prevent untrusted PR code from using them | |
| persist-credentials: false | |
| - name: Install uv | |
| uses: astral-sh/setup-uv@v7 | |
| with: | |
| version: latest | |
| python-version: ${{ matrix.python-version }} | |
| - name: Install Python dependencies using uv | |
| run: | | |
| uv sync --dev | |
| uv pip install pytest | |
| # Run integration test evaluation | |
| - name: Determine test selection | |
| run: | | |
| TEST_TYPE_ARGS="" | |
| if [ "${{ github.event_name }}" = "pull_request_target" ] && [ "${{ github.event.label.name }}" = "behavior-test" ]; then | |
| TEST_TYPE_ARGS="--test-type behavior" | |
| echo "behavior-test label detected; running behavior tests only." | |
| elif [ "${{ github.event_name }}" = "pull_request_target" ] && [ "${{ github.event.label.name }}" = "integration-test" ]; then | |
| TEST_TYPE_ARGS="--test-type integration" | |
| echo "integration-test label detected; running integration tests only." | |
| elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then | |
| test_type="${{ github.event.inputs.test_type }}" | |
| case "$test_type" in | |
| behavior) | |
| TEST_TYPE_ARGS="--test-type behavior" | |
| echo "workflow_dispatch requested behavior tests only." | |
| ;; | |
| integration) | |
| TEST_TYPE_ARGS="--test-type integration" | |
| echo "workflow_dispatch requested integration tests only." | |
| ;; | |
| ""|all) | |
| echo "workflow_dispatch requested full integration suite." | |
| ;; | |
| *) | |
| echo "workflow_dispatch provided unknown test_type '$test_type'; defaulting to full suite." | |
| ;; | |
| esac | |
| elif [ "${{ github.event_name }}" = "schedule" ]; then | |
| TEST_TYPE_ARGS="--test-type integration" | |
| echo "Scheduled run; running integration tests only." | |
| else | |
| echo "Running full integration test suite." | |
| fi | |
| echo "TEST_TYPE_ARGS=$TEST_TYPE_ARGS" >> "$GITHUB_ENV" | |
| - name: Run integration test evaluation for ${{ matrix.job-config['name'] }} | |
| env: | |
| LLM_CONFIG: ${{ toJson(matrix.job-config['llm-config']) }} | |
| LLM_API_KEY: ${{ secrets.LLM_API_KEY_EVAL }} | |
| LLM_BASE_URL: https://llm-proxy.eval.all-hands.dev | |
| TOOL_PRESET: ${{ github.event.inputs.tool_preset || 'default' }} | |
| run: | | |
| set -eo pipefail | |
| AGENT_SDK_VERSION=$(git rev-parse --short HEAD) | |
| EVAL_NOTE="${AGENT_SDK_VERSION}_${{ matrix.job-config['run-suffix'] }}" | |
| echo "Invoking test runner with TEST_TYPE_ARGS='$TEST_TYPE_ARGS' TOOL_PRESET='$TOOL_PRESET'" | |
| uv run python tests/integration/run_infer.py \ | |
| --llm-config "$LLM_CONFIG" \ | |
| --num-workers $N_PROCESSES \ | |
| --eval-note "$EVAL_NOTE" \ | |
| --tool-preset "$TOOL_PRESET" \ | |
| $TEST_TYPE_ARGS | |
| # get integration tests JSON results | |
| RESULTS_FILE=$(find tests/integration/outputs/*${{ matrix.job-config['run-suffix'] }}* -name "results.json" -type f | head -n 1) | |
| echo "RESULTS_FILE: $RESULTS_FILE" | |
| if [ -f "$RESULTS_FILE" ]; then | |
| echo "JSON_RESULTS_FILE=$RESULTS_FILE" >> $GITHUB_ENV | |
| else | |
| echo "JSON_RESULTS_FILE=" >> $GITHUB_ENV | |
| fi | |
| - name: Wait a little bit | |
| run: sleep 10 | |
| - name: Create archive of evaluation outputs | |
| run: | | |
| TIMESTAMP=$(date +'%y-%m-%d-%H-%M') | |
| cd tests/integration/outputs # Change to the outputs directory | |
| tar -czvf ../../../integration_tests_${{ matrix.job-config['run-suffix'] }}_${TIMESTAMP}.tar.gz *${{ matrix.job-config['run-suffix'] }}* # Include result directories for this model | |
| - name: Upload evaluation results as artifact | |
| uses: actions/upload-artifact@v7 | |
| id: upload_results_artifact | |
| with: | |
| name: integration-test-outputs-${{ matrix.job-config['run-suffix'] }}-${{ github.run_id }}-${{ github.run_attempt }} | |
| path: integration_tests_${{ matrix.job-config['run-suffix'] }}_*.tar.gz | |
| - name: Save test results for consolidation | |
| run: | | |
| # Copy the structured JSON results file for consolidation | |
| mkdir -p test_results_summary | |
| if [ -n "${{ env.JSON_RESULTS_FILE }}" ] && [ -f "${{ env.JSON_RESULTS_FILE }}" ]; then | |
| # Copy the JSON results file directly | |
| cp "${{ env.JSON_RESULTS_FILE }}" "test_results_summary/${{ matrix.job-config['run-suffix'] }}_results.json" | |
| echo "✓ Copied JSON results file for consolidation" | |
| else | |
| echo "✗ No JSON results file found" | |
| exit 1 | |
| fi | |
| - name: Upload test results summary | |
| uses: actions/upload-artifact@v7 | |
| with: | |
| name: test-results-${{ matrix.job-config['run-suffix'] }} | |
| path: test_results_summary/${{ matrix.job-config['run-suffix'] }}_results.json | |
| consolidate-results: | |
| needs: [setup-matrix, run-integration-tests] | |
| if: | | |
| always() && ( | |
| ( | |
| github.event_name == 'pull_request_target' && ( | |
| github.event.label.name == 'integration-test' || | |
| github.event.label.name == 'behavior-test' | |
| ) | |
| ) || | |
| github.event_name == 'workflow_dispatch' || | |
| (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') | |
| ) | |
| runs-on: ubuntu-24.04 | |
| permissions: | |
| contents: read | |
| pull-requests: write | |
| issues: write | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v6 | |
| with: | |
| # When using pull_request_target, explicitly checkout the PR branch | |
| # This ensures we use the scripts from the actual PR code | |
| ref: ${{ github.event.pull_request.head.sha || github.ref }} | |
| - name: Install uv | |
| uses: astral-sh/setup-uv@v7 | |
| with: | |
| version: latest | |
| python-version: '3.13' | |
| - name: Install Python dependencies using uv | |
| run: | | |
| uv sync --dev | |
| - name: Download all test results | |
| uses: actions/download-artifact@v8 | |
| with: | |
| pattern: test-results-* | |
| merge-multiple: true | |
| path: all_results | |
| - name: Download all integration test artifacts | |
| uses: actions/download-artifact@v8 | |
| with: | |
| pattern: integration-test-outputs-* | |
| path: artifacts | |
| - name: Consolidate test results | |
| env: | |
| EVENT_NAME: ${{ github.event_name }} | |
| PR_NUMBER: ${{ github.event.pull_request.number }} | |
| MANUAL_REASON: ${{ github.event.inputs.reason }} | |
| COMMIT_SHA: ${{ github.sha }} | |
| PYTHONPATH: ${{ github.workspace }} | |
| GITHUB_SERVER_URL: ${{ github.server_url }} | |
| GITHUB_REPOSITORY: ${{ github.repository }} | |
| GITHUB_RUN_ID: ${{ github.run_id }} | |
| run: | | |
| uv run python tests/integration/utils/consolidate_json_results.py \ | |
| --results-dir all_results \ | |
| --artifacts-dir artifacts \ | |
| --output-file consolidated_results.json | |
| echo "Consolidated results generated successfully" | |
| uv run python tests/integration/utils/generate_markdown_report.py \ | |
| --input-file consolidated_results.json \ | |
| --output-file consolidated_report.md | |
| - name: Upload consolidated report | |
| uses: actions/upload-artifact@v7 | |
| with: | |
| name: consolidated-report | |
| path: consolidated_report.md | |
| - name: Create consolidated PR comment | |
| if: github.event_name == 'pull_request_target' | |
| run: | | |
| # Sanitize @OpenHands mentions to prevent self-mention loops | |
| COMMENT_BODY=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < consolidated_report.md) | |
| # Use GitHub CLI to create comment with explicit PR number | |
| echo "$COMMENT_BODY" | gh pr comment ${{ github.event.pull_request.number }} --body-file - | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| - name: Comment on specified issue/PR (workflow_dispatch) | |
| if: github.event_name == 'workflow_dispatch' && needs.setup-matrix.outputs.issue_number != '' | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| ISSUE_NUMBER: ${{ needs.setup-matrix.outputs.issue_number }} | |
| run: | | |
| # Sanitize @OpenHands mentions to prevent self-mention loops | |
| COMMENT_BODY=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < consolidated_report.md) | |
| # Use GitHub CLI to create comment on the specified issue/PR | |
| echo "$COMMENT_BODY" | gh issue comment "$ISSUE_NUMBER" --body-file - | |
| - name: Read consolidated report for tracker issue | |
| if: github.event_name == 'schedule' | |
| id: read_report | |
| run: | | |
| # Read and sanitize the report, then set as output | |
| REPORT_CONTENT=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < consolidated_report.md) | |
| echo "report<<EOF" >> $GITHUB_OUTPUT | |
| echo "$REPORT_CONTENT" >> $GITHUB_OUTPUT | |
| echo "EOF" >> $GITHUB_OUTPUT | |
| - name: Comment with results on tracker issue | |
| if: github.event_name == 'schedule' | |
| uses: KeisukeYamashita/create-comment@v1 | |
| with: | |
| number: 2078 | |
| unique: false | |
| comment: | | |
| **Trigger:** Nightly Scheduled Run | |
| **Commit:** ${{ github.sha }} | |
| ${{ steps.read_report.outputs.report }} | |