
Commit 909d6b9

Merge branch 'main' into fix/tool-call-compat-shim
2 parents 89171a2 + e220bda commit 909d6b9

File tree

8 files changed: +202, -17 lines

.github/run-eval/ADDINGMODEL.md

Lines changed: 29 additions & 13 deletions
````diff
@@ -240,29 +240,44 @@ cd .github/run-eval
 MODEL_IDS="your-model-id" GITHUB_OUTPUT=/tmp/output.txt python resolve_model_config.py
 ```
 
-## Step 6: Run Integration Tests (Required Before PR)
+## Step 6: Create Draft PR
 
-**Mandatory**: Integration tests must pass before creating PR.
+Push your branch and create a draft PR. Note the PR number returned - you'll need it for the integration tests.
 
-### Via GitHub Actions
+## Step 7: Run Integration Tests
 
-1. Push branch: `git push origin your-branch-name`
-2. Navigate to: https://github.com/OpenHands/software-agent-sdk/actions/workflows/integration-runner.yml
-3. Click "Run workflow"
-4. Configure:
-   - **Branch**: Select your branch
-   - **model_ids**: `your-model-id`
-   - **Reason**: "Testing model-id"
-5. Wait for completion
-6. **Save run URL** - required for PR description
+Trigger integration tests on your PR branch:
+
+```bash
+gh workflow run integration-runner.yml \
+  -f model_ids=your-model-id \
+  -f reason="Testing new model from PR #<pr-number>" \
+  -f issue_number=<pr-number> \
+  --ref your-branch-name
+```
+
+Results will be posted back to the PR as a comment.
 
 ### Expected Results
 
 - Success rate: 100% (or 87.5% if vision test skipped)
 - Duration: 5-10 minutes per model
 - Tests: 8 total (basic commands, file ops, code editing, reasoning, errors, tools, context, vision)
 
-## Step 7: Create PR
+## Step 8: Fix Issues and Rerun (if needed)
+
+If tests fail, see [Common Issues](#common-issues) below. After fixing:
+
+1. Push the fix: `git add . && git commit && git push`
+2. Rerun integration tests with the same command from Step 7 (using the same PR number)
+
+## Step 9: Mark PR Ready
+
+When tests pass, mark the PR as ready for review:
+
+```bash
+gh pr ready <pr-number>
+```
 
 ### Required in PR Description
 
````
```diff
@@ -379,3 +394,4 @@ Fixes #[issue-number]
 - Recent model additions: #2102, #2153, #2207, #2233, #2269
 - Common issues: #2147 (hangs), #2137 (parameters), #2110 (vision), #2233 (variants), #2193 (preflight)
 - Integration test workflow: `.github/workflows/integration-runner.yml`
+- Integration tests can be triggered via: `gh workflow run integration-runner.yml --ref <branch>`
```

.github/run-eval/resolve_model_config.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -242,6 +242,16 @@ def _sigterm_handler(signum: int, _frame: object) -> None:
             "disable_vision": True,
         },
     },
+    "glm-5.1": {
+        "id": "glm-5.1",
+        "display_name": "GLM-5.1",
+        "llm_config": {
+            "model": "litellm_proxy/openrouter/z-ai/glm-5.1",
+            "temperature": 0.0,
+            # OpenRouter glm-5.1 is text-only despite LiteLLM reporting vision support
+            "disable_vision": True,
+        },
+    },
     "qwen3-coder-next": {
         "id": "qwen3-coder-next",
         "display_name": "Qwen3 Coder Next",
```
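For context, `resolve_model_config.py` keeps model configs in a plain dict keyed by model id, as the hunk above shows. A minimal sketch of that registry pattern follows; `get_model` is an illustrative helper, not the real script's API (the actual script reads `MODEL_IDS` and writes to `GITHUB_OUTPUT`):

```python
# Illustrative registry mirroring the glm-5.1 entry added above.
# get_model is a hypothetical lookup helper, not part of the real module.
MODELS = {
    "glm-5.1": {
        "id": "glm-5.1",
        "display_name": "GLM-5.1",
        "llm_config": {
            "model": "litellm_proxy/openrouter/z-ai/glm-5.1",
            "temperature": 0.0,
            # OpenRouter glm-5.1 is text-only despite LiteLLM reporting vision support
            "disable_vision": True,
        },
    },
}


def get_model(model_id: str) -> dict:
    """Look up a model config, failing loudly on an unknown id."""
    if model_id not in MODELS:
        raise KeyError(f"unknown model id: {model_id!r}")
    return MODELS[model_id]


assert get_model("glm-5.1")["llm_config"]["disable_vision"] is True
```

Keeping each entry fully declarative is what makes the `test_glm_5_1_config` test further down a straight field-by-field comparison.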

.github/workflows/run-eval.yml

Lines changed: 10 additions & 2 deletions
```diff
@@ -63,6 +63,11 @@ on:
         required: false
         default: main
         type: string
+      extensions_branch:
+        description: Extensions repo branch to use (for testing feature branches with skills/plugins)
+        required: false
+        default: main
+        type: string
       instance_ids:
         description: >-
           Comma-separated instance IDs to evaluate.
@@ -157,6 +162,7 @@ jobs:
           echo "reason: ${{ github.event.inputs.reason || 'N/A' }}"
           echo "eval_branch: ${{ github.event.inputs.eval_branch || 'main' }}"
           echo "benchmarks_branch: ${{ github.event.inputs.benchmarks_branch || 'main' }}"
+          echo "extensions_branch: ${{ github.event.inputs.extensions_branch || 'main' }}"
           echo "instance_ids: ${{ github.event.inputs.instance_ids || 'N/A' }}"
           echo "num_infer_workers: ${{ github.event.inputs.num_infer_workers || '(default)' }}"
           echo "num_eval_workers: ${{ github.event.inputs.num_eval_workers || '(default)' }}"
@@ -341,6 +347,7 @@ jobs:
          EVAL_WORKFLOW: ${{ env.EVAL_WORKFLOW }}
          EVAL_BRANCH: ${{ github.event.inputs.eval_branch || 'main' }}
          BENCHMARKS_BRANCH: ${{ github.event.inputs.benchmarks_branch || 'main' }}
+         EXTENSIONS_BRANCH: ${{ github.event.inputs.extensions_branch || 'main' }}
          BENCHMARK: ${{ github.event.inputs.benchmark || 'swebench' }}
          TRIGGER_REASON: ${{ github.event.inputs.reason }}
          PR_NUMBER: ${{ steps.params.outputs.pr_number }}
@@ -357,7 +364,7 @@ jobs:
           # Normalize instance_ids: strip all spaces
           INSTANCE_IDS=$(printf '%s' "$INSTANCE_IDS" | tr -d ' ')
 
-          echo "Dispatching evaluation workflow with SDK commit: $SDK_SHA (benchmark: $BENCHMARK, eval branch: $EVAL_BRANCH, benchmarks branch: $BENCHMARKS_BRANCH, tool preset: $TOOL_PRESET)"
+          echo "Dispatching evaluation workflow with SDK commit: $SDK_SHA (benchmark: $BENCHMARK, eval branch: $EVAL_BRANCH, benchmarks branch: $BENCHMARKS_BRANCH, extensions branch: $EXTENSIONS_BRANCH, tool preset: $TOOL_PRESET)"
           PAYLOAD=$(jq -n \
             --arg sdk "$SDK_SHA" \
             --arg sdk_run_id "${{ github.run_id }}" \
@@ -367,6 +374,7 @@ jobs:
             --arg reason "$TRIGGER_REASON" \
             --arg pr "$PR_NUMBER" \
             --arg benchmarks "$BENCHMARKS_BRANCH" \
+            --arg extensions "$EXTENSIONS_BRANCH" \
             --arg benchmark "$BENCHMARK" \
             --arg instance_ids "$INSTANCE_IDS" \
             --arg num_infer_workers "$NUM_INFER_WORKERS" \
@@ -377,7 +385,7 @@ jobs:
             --arg agent_type "$AGENT_TYPE" \
             --arg partial_archive_url "$PARTIAL_ARCHIVE_URL" \
             --arg triggered_by "$TRIGGERED_BY" \
-            '{ref: $ref, inputs: {sdk_commit: $sdk, sdk_workflow_run_id: $sdk_run_id, eval_limit: $eval_limit, models_json: ($models | tostring), trigger_reason: $reason, pr_number: $pr, benchmarks_branch: $benchmarks, benchmark: $benchmark, instance_ids: $instance_ids, num_infer_workers: $num_infer_workers, num_eval_workers: $num_eval_workers, enable_conversation_event_logging: $enable_conversation_event_logging, max_retries: $max_retries, tool_preset: $tool_preset, agent_type: $agent_type, partial_archive_url: $partial_archive_url, triggered_by: $triggered_by}}')
+            '{ref: $ref, inputs: {sdk_commit: $sdk, sdk_workflow_run_id: $sdk_run_id, eval_limit: $eval_limit, models_json: ($models | tostring), trigger_reason: $reason, pr_number: $pr, benchmarks_branch: $benchmarks, extensions_branch: $extensions, benchmark: $benchmark, instance_ids: $instance_ids, num_infer_workers: $num_infer_workers, num_eval_workers: $num_eval_workers, enable_conversation_event_logging: $enable_conversation_event_logging, max_retries: $max_retries, tool_preset: $tool_preset, agent_type: $agent_type, partial_archive_url: $partial_archive_url, triggered_by: $triggered_by}}')
           RESPONSE=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" -X POST \
             -H "Authorization: token $PAT_TOKEN" \
             -H "Accept: application/vnd.github+json" \
```

openhands-agent-server/openhands/agent_server/event_service.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -656,7 +656,8 @@ async def close(self):
         await self._pub_sub.close()
         if self._conversation:
             loop = asyncio.get_running_loop()
-            loop.run_in_executor(None, self._conversation.close)
+            await loop.run_in_executor(None, self._conversation.close)
+            self._conversation = None
 
     async def generate_title(
         self, llm: "LLM | None" = None, max_length: int = 50
```
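The bug fixed above is that `run_in_executor` returns a Future: without `await`, `close()` schedules the blocking `Conversation.close()` on a worker thread and returns immediately, so teardown may still be running. A minimal standalone sketch of the two behaviors (names here are illustrative, not the real `EventService` API):

```python
# Sketch of fire-and-forget vs awaited run_in_executor; slow_teardown
# stands in for the blocking Conversation.close() call.
import asyncio
import time


def slow_teardown(done: list) -> None:
    time.sleep(0.05)  # simulate non-trivial blocking cleanup
    done.append(True)


async def close_fire_and_forget() -> bool:
    # Old behavior: the returned Future is discarded, so the coroutine
    # returns before the executor thread finishes.
    done: list = []
    loop = asyncio.get_running_loop()
    loop.run_in_executor(None, slow_teardown, done)
    return bool(done)  # teardown almost certainly not finished yet


async def close_awaited() -> bool:
    # Fixed behavior: close() only returns once teardown completes.
    done: list = []
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, slow_teardown, done)
    return bool(done)
```

Clearing `self._conversation = None` afterwards is what makes the second `close()` call a no-op, which the new `test_close_is_idempotent` test below relies on.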

openhands-sdk/openhands/sdk/context/skills/skill.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -1,5 +1,6 @@
 import io
 import json
+import os
 import re
 from pathlib import Path
 from typing import Annotated, ClassVar, Literal, Union
@@ -891,7 +892,9 @@ def load_project_skills(work_dir: str | Path) -> list[Skill]:
 
 # Public skills repository configuration
 PUBLIC_SKILLS_REPO = "https://github.com/OpenHands/extensions"
-PUBLIC_SKILLS_BRANCH = "main"
+# Allow overriding the branch via EXTENSIONS_REF environment variable
+# (used by evaluation/benchmarks workflows to test feature branches)
+PUBLIC_SKILLS_BRANCH = os.environ.get("EXTENSIONS_REF", "main")
 DEFAULT_MARKETPLACE_PATH = "marketplaces/default.json"
 
```
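The override has one subtlety that the new test file at the bottom of this commit pins down: `os.environ.get` only falls back to the default when the variable is absent, not when it is set to an empty string. A minimal sketch (the helper names are illustrative; dicts stand in for `os.environ`):

```python
# Behavior of the patched line vs a hypothetical empty-tolerant variant.
def resolve_branch(environ: dict) -> str:
    # Mirrors: PUBLIC_SKILLS_BRANCH = os.environ.get("EXTENSIONS_REF", "main")
    return environ.get("EXTENSIONS_REF", "main")


def resolve_branch_nonempty(environ: dict) -> str:
    # Hypothetical alternative (NOT what the patch does): treat "" as unset.
    return environ.get("EXTENSIONS_REF") or "main"


assert resolve_branch({}) == "main"
assert resolve_branch({"EXTENSIONS_REF": "feature-branch"}) == "feature-branch"
assert resolve_branch({"EXTENSIONS_REF": ""}) == ""  # default does NOT apply
assert resolve_branch_nonempty({"EXTENSIONS_REF": ""}) == "main"
```

So workflows that export `EXTENSIONS_REF=""` get an empty branch name rather than `main`; callers should leave the variable unset to get the default.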

tests/agent_server/test_event_service.py

Lines changed: 46 additions & 0 deletions
```diff
@@ -1725,3 +1725,49 @@ def hold_lock_like_run_loop():
         f"for {hold_seconds}s. The read path is blocked by the write lock "
         f"(see HANG_REPRO.md)."
     )
+
+
+class TestEventServiceClose:
+    """Tests for EventService.close() awaiting conversation teardown."""
+
+    @pytest.mark.asyncio
+    async def test_close_awaits_conversation_close(self, event_service):
+        """close() must await conversation.close(), not fire-and-forget."""
+        conversation = MagicMock(spec=Conversation)
+        event_service._conversation = conversation
+
+        closed = asyncio.Event()
+
+        def slow_close():
+            # Simulate non-trivial teardown work
+            time.sleep(0.05)
+            closed.set()
+
+        conversation.close = slow_close
+
+        await event_service.close()
+
+        assert closed.is_set(), (
+            "EventService.close() returned before conversation.close() finished"
+        )
+
+    @pytest.mark.asyncio
+    async def test_close_clears_conversation_reference(self, event_service):
+        """close() must set _conversation to None after closing."""
+        conversation = MagicMock()
+        event_service._conversation = conversation
+
+        await event_service.close()
+
+        assert event_service._conversation is None
+
+    @pytest.mark.asyncio
+    async def test_close_is_idempotent(self, event_service):
+        """Calling close() twice must not raise."""
+        conversation = MagicMock()
+        event_service._conversation = conversation
+
+        await event_service.close()
+        await event_service.close()  # second call — _conversation is already None
+
+        conversation.close.assert_called_once()
```

tests/github_workflows/test_resolve_model_config.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -268,6 +268,16 @@ def test_glm_5_config():
     assert model["llm_config"]["disable_vision"] is True
 
 
+def test_glm_5_1_config():
+    """Test that glm-5.1 has correct configuration."""
+    model = MODELS["glm-5.1"]
+
+    assert model["id"] == "glm-5.1"
+    assert model["display_name"] == "GLM-5.1"
+    assert model["llm_config"]["model"] == "litellm_proxy/openrouter/z-ai/glm-5.1"
+    assert model["llm_config"]["disable_vision"] is True
+
+
 # Tests for preflight check functionality
 
```

Lines changed: 91 additions & 0 deletions
```diff
@@ -0,0 +1,91 @@
+"""Tests for EXTENSIONS_REF environment variable support.
+
+These tests use subprocess to run each test in an isolated Python process,
+avoiding module state pollution that would affect other tests.
+"""
+
+import subprocess
+import sys
+
+
+def _run_in_subprocess(test_code: str, env_extra: dict | None = None) -> None:
+    """Run test code in a subprocess with the given environment variables."""
+    import os
+
+    env = os.environ.copy()
+    if env_extra:
+        env.update(env_extra)
+
+    result = subprocess.run(
+        [sys.executable, "-c", test_code],
+        env=env,
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        raise AssertionError(
+            f"Subprocess test failed:\nstdout: {result.stdout}\nstderr: {result.stderr}"
+        )
+
+
+def test_extensions_ref_default():
+    """PUBLIC_SKILLS_BRANCH should default to 'main' when EXTENSIONS_REF is not set."""
+    code = """
+import os
+if "EXTENSIONS_REF" in os.environ:
+    del os.environ["EXTENSIONS_REF"]
+from openhands.sdk.context.skills.skill import PUBLIC_SKILLS_BRANCH
+assert PUBLIC_SKILLS_BRANCH == "main", (
+    f"Expected 'main' but got '{PUBLIC_SKILLS_BRANCH}'"
+)
+"""
+    _run_in_subprocess(code)
+
+
+def test_extensions_ref_custom_branch():
+    """PUBLIC_SKILLS_BRANCH should use EXTENSIONS_REF when set."""
+    code = """
+from openhands.sdk.context.skills.skill import PUBLIC_SKILLS_BRANCH
+assert PUBLIC_SKILLS_BRANCH == "feature-branch", (
+    f"Expected 'feature-branch' but got '{PUBLIC_SKILLS_BRANCH}'"
+)
+"""
+    _run_in_subprocess(code, {"EXTENSIONS_REF": "feature-branch"})
+
+
+def test_extensions_ref_with_load_public_skills():
+    """load_public_skills should respect EXTENSIONS_REF environment variable."""
+    code = """
+from unittest import mock
+from openhands.sdk.context.skills.skill import (
+    PUBLIC_SKILLS_BRANCH,
+    load_public_skills,
+)
+assert PUBLIC_SKILLS_BRANCH == "test-branch", (
+    f"Expected 'test-branch' but got '{PUBLIC_SKILLS_BRANCH}'"
+)
+with mock.patch(
+    "openhands.sdk.context.skills.skill.update_skills_repository"
+) as mock_update:
+    mock_update.return_value = None
+    load_public_skills()
+mock_update.assert_called_once()
+call_args = mock_update.call_args
+# branch is 2nd positional arg: (repo_url, branch, cache_dir)
+assert call_args[0][1] == "test-branch", (
+    f"Expected branch='test-branch' but got {call_args[0][1]}"
+)
+"""
+    _run_in_subprocess(code, {"EXTENSIONS_REF": "test-branch"})
+
+
+def test_extensions_ref_empty_string():
+    """Empty EXTENSIONS_REF yields '' (os.environ.get does not fall back)."""
+    code = """
+from openhands.sdk.context.skills.skill import PUBLIC_SKILLS_BRANCH
+# Empty string returns empty string per os.environ.get behavior
+assert PUBLIC_SKILLS_BRANCH == "", (
+    f"Expected '' but got '{PUBLIC_SKILLS_BRANCH}'"
+)
+"""
+    _run_in_subprocess(code, {"EXTENSIONS_REF": ""})
```
