Commit 8db8a82

Merge branch 'main' into fix/windows-fcntl-import-crash
2 parents 3967d7a + 2d3d96d commit 8db8a82

194 files changed, +16849 -3472 lines changed

Lines changed: 30 additions & 4 deletions
@@ -1,6 +1,6 @@
 ---
-name: code-review
-description: Structured code review covering style, readability, and security concerns with actionable feedback. Use when reviewing pull requests or merge requests to identify issues and suggest improvements.
+name: custom-codereview-guide
+description: Repo-specific code review guidelines for OpenHands/software-agent-sdk. Provides SDK-specific review rules in addition to the default code review skill.
 triggers:
 - /codereview
 ---
@@ -13,9 +13,27 @@ You are an expert code reviewer for the **OpenHands/software-agent-sdk** reposit
 
 You have permission to **APPROVE** or **COMMENT** on PRs. Do not use REQUEST_CHANGES.
 
-**Default to APPROVE**: If your review finds no issues at "important" level or higher, approve the PR. Minor suggestions or nitpicks alone are not sufficient reason to withhold approval.
+### Review decision policy (eval / benchmark risk)
 
-**IMPORTANT: If you determine a PR is worth merging, you should approve it.** Don’t just say a PR is "worth merging" or "ready to merge" without actually submitting an approval. Your words and actions should be consistent.
+Do **NOT** submit an **APPROVE** review when the PR changes agent behavior or anything
+that could plausibly affect benchmark/evaluation performance.
+
+Examples include: prompt templates, tool calling/execution, planning/loop logic,
+memory/condenser behavior, terminal/stdin/stdout handling, or evaluation harness code.
+
+If a PR is in this category (or you are uncertain), leave a **COMMENT** review and
+explicitly flag it for a human maintainer to decide after running lightweight evals.
+
+### Default approval policy
+
+**Default to APPROVE**: If your review finds no issues at "important" level or higher,
+approve the PR. Minor suggestions or nitpicks alone are not sufficient reason to
+withhold approval.
+
+**IMPORTANT:** If you determine a PR is worth merging **and it is not in the eval-risk
+category above**, you should approve it. Don’t just say a PR is "worth merging" or
+"ready to merge" without actually submitting an approval. Your words and actions should
+be consistent.
 
 ### When to APPROVE
 
@@ -29,6 +47,14 @@ Examples of straightforward and low-risk PRs you should approve (non-exhaustive)
 - **Test-only changes**: Adding or updating tests without changing production code
 - **Dependency updates**: Version bumps with passing CI
 
+### When NOT to APPROVE - Blocking Issues
+
+**DO NOT APPROVE** PRs that have any of the following issues:
+
+- **Package version bumps in non-release PRs**: If any `pyproject.toml` file has changes to the `version` field (e.g., `version = "1.12.0"` → `version = "1.13.0"`), and the PR is NOT explicitly a release PR (title/description doesn't indicate it's a release), **DO NOT APPROVE**. Version numbers should only be changed in dedicated release PRs managed by maintainers.
+  - Check: Look for changes to `version = "..."` in any `*/pyproject.toml` files
+  - Exception: PRs with titles like "release: v1.x.x" or "chore: bump version to 1.x.x" from maintainers
+
 Examples:
 - A PR adding a new model to `resolve_model_config.py` or `verified_models.py` with corresponding test updates
 - A PR adding documentation notes to docstrings clarifying method behavior (e.g., security considerations, bypass behaviors)
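The version-bump check in that bullet can be approximated mechanically. A minimal sketch, assuming a unified-diff string as input; the helper names, regex, and release-title heuristic are illustrative only, not code from the repo:

```python
import re

# Matches an added or removed `version = "..."` line in a unified diff.
VERSION_LINE = re.compile(r'^[+-]\s*version\s*=\s*"[^"]+"', re.MULTILINE)


def has_version_bump(diff_text: str) -> bool:
    """Return True if the diff touches a `version = "..."` line."""
    return bool(VERSION_LINE.search(diff_text))


def is_release_pr(title: str) -> bool:
    """Heuristic from the guideline: release PR titles look like
    'release: v1.x.x' or 'chore: bump version to 1.x.x'."""
    return bool(re.match(r"^(release:|chore: bump version)", title, re.IGNORECASE))


diff = '''--- a/sdk/pyproject.toml
+++ b/sdk/pyproject.toml
-version = "1.12.0"
+version = "1.13.0"
'''
print(has_version_bump(diff))  # True
print(is_release_pr("fix: typo in docs"))  # False
```

A real implementation would also restrict the scan to `*/pyproject.toml` file paths, which this sketch omits for brevity.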

.github/actions/pr-review/action.yml

Lines changed: 5 additions & 1 deletion
@@ -67,10 +67,14 @@ runs:
       with:
         python-version: '3.12'
 
+    # Security: this workflow executes untrusted PR content (diff/title/body) via an
+    # LLM-powered reviewer agent that can run Bash. GitHub Actions caches are shared
+    # across workflows within a repository and can enable cache-poisoning pivots into
+    # more-privileged workflows. Keep caching disabled here.
     - name: Install uv
       uses: astral-sh/setup-uv@v6
       with:
-        enable-cache: true
+        enable-cache: false
 
     - name: Install GitHub CLI
       shell: bash

.github/run-eval/resolve_model_config.py

Lines changed: 160 additions & 13 deletions
@@ -1,9 +1,12 @@
 #!/usr/bin/env python3
 """
-Resolve model IDs to full model configurations.
+Resolve model IDs to full model configurations and verify model availability.
 
 Reads:
 - MODEL_IDS: comma-separated model IDs
+- LLM_API_KEY: API key for litellm_proxy (optional, for preflight check)
+- LLM_BASE_URL: Base URL for litellm_proxy (optional, defaults to eval proxy)
+- SKIP_PREFLIGHT: Set to 'true' to skip the preflight LLM check
 
 Outputs to GITHUB_OUTPUT:
 - models_json: JSON array of full model configs with display names
@@ -12,6 +15,13 @@
 import json
 import os
 import sys
+from typing import Any
+
+
+# SDK-specific parameters that should not be passed to litellm.
+# These parameters are used by the SDK's LLM wrapper but are not part of litellm's API.
+# Keep this list in sync with SDK LLM config parameters that are SDK-internal.
+SDK_ONLY_PARAMS = {"disable_vision"}
 
 
 # Model configurations dictionary
@@ -48,6 +58,14 @@
             "litellm_extra_body": {"enable_thinking": True},
         },
     },
+    "qwen3.5-flash": {
+        "id": "qwen3.5-flash",
+        "display_name": "Qwen3.5 Flash",
+        "llm_config": {
+            "model": "litellm_proxy/dashscope/qwen3.5-flash-2026-02-23",
+            "temperature": 0.0,
+        },
+    },
     "claude-4.5-opus": {
         "id": "claude-4.5-opus",
         "display_name": "Claude 4.5 Opus",
@@ -64,6 +82,14 @@
             "temperature": 0.0,
         },
     },
+    "claude-sonnet-4-6": {
+        "id": "claude-sonnet-4-6",
+        "display_name": "Claude Sonnet 4.6",
+        "llm_config": {
+            "model": "litellm_proxy/anthropic/claude-sonnet-4-6",
+            "temperature": 0.0,
+        },
+    },
     "gemini-3-pro": {
         "id": "gemini-3-pro",
         "display_name": "Gemini 3 Pro",
@@ -74,6 +100,11 @@
         "display_name": "Gemini 3 Flash",
         "llm_config": {"model": "litellm_proxy/gemini-3-flash-preview"},
     },
+    "gemini-3.1-pro": {
+        "id": "gemini-3.1-pro",
+        "display_name": "Gemini 3.1 Pro",
+        "llm_config": {"model": "litellm_proxy/gemini-3.1-pro-preview"},
+    },
     "gpt-5.2": {
         "id": "gpt-5.2",
         "display_name": "GPT-5.2",
@@ -100,22 +131,17 @@
     "minimax-m2.5": {
         "id": "minimax-m2.5",
         "display_name": "MiniMax M2.5",
-        "llm_config": {"model": "litellm_proxy/minimax/MiniMax-M2.5"},
+        "llm_config": {
+            "model": "litellm_proxy/minimax/MiniMax-M2.5",
+            "temperature": 1.0,
+            "top_p": 0.95,
+        },
     },
     "minimax-m2.1": {
         "id": "minimax-m2.1",
         "display_name": "MiniMax M2.1",
         "llm_config": {"model": "litellm_proxy/minimax/MiniMax-M2.1"},
     },
-    "jade-spark-2862": {
-        "id": "jade-spark-2862",
-        "display_name": "Jade Spark 2862",
-        "llm_config": {
-            "model": "litellm_proxy/jade-spark-2862",
-            "temperature": 1.0,
-            "top_p": 0.95,
-        },
-    },
     "deepseek-v3.2-reasoner": {
         "id": "deepseek-v3.2-reasoner",
         "display_name": "DeepSeek V3.2 Reasoner",
@@ -150,6 +176,8 @@
         "display_name": "GLM-5",
         "llm_config": {
             "model": "litellm_proxy/openrouter/z-ai/glm-5",
+            # OpenRouter glm-5 is text-only despite LiteLLM reporting vision support
+            "disable_vision": True,
         },
     },
     "qwen3-coder-next": {
@@ -207,6 +235,122 @@ def find_models_by_id(model_ids: list[str]) -> list[dict]:
     return resolved
 
 
+def check_model(
+    model_config: dict[str, Any],
+    api_key: str,
+    base_url: str,
+    timeout: int = 60,
+) -> tuple[bool, str]:
+    """Check a single model with a simple completion request using litellm.
+
+    Args:
+        model_config: Model configuration dict with 'llm_config' key
+        api_key: API key for authentication
+        base_url: Base URL for the LLM proxy
+        timeout: Request timeout in seconds
+
+    Returns:
+        Tuple of (success: bool, message: str)
+    """
+    import litellm
+
+    llm_config = model_config.get("llm_config", {})
+    model_name = llm_config.get("model", "unknown")
+    display_name = model_config.get("display_name", model_name)
+
+    try:
+        # Build kwargs from llm_config, excluding 'model' and SDK-specific params
+        kwargs = {
+            k: v
+            for k, v in llm_config.items()
+            if k != "model" and k not in SDK_ONLY_PARAMS
+        }
+
+        # Use simple arithmetic prompt that works reliably across all models
+        # max_tokens=100 provides enough room for models to respond
+        # (some need >10 tokens)
+        response = litellm.completion(
+            model=model_name,
+            messages=[{"role": "user", "content": "1+1="}],
+            max_tokens=100,
+            api_key=api_key,
+            base_url=base_url,
+            timeout=timeout,
+            **kwargs,
+        )
+
+        content = response.choices[0].message.content if response.choices else None
+
+        if content:
+            return True, f"✓ {display_name}: OK"
+        else:
+            # Check if there's any other data in the response for diagnostics
+            finish_reason = (
+                response.choices[0].finish_reason if response.choices else None
+            )
+            usage = getattr(response, "usage", None)
+            return (
+                False,
+                (
+                    f"✗ {display_name}: Empty response "
+                    f"(finish_reason={finish_reason}, usage={usage})"
+                ),
+            )
+
+    except litellm.exceptions.Timeout:
+        return False, f"✗ {display_name}: Request timed out after {timeout}s"
+    except litellm.exceptions.APIConnectionError as e:
+        return False, f"✗ {display_name}: Connection error - {e}"
+    except litellm.exceptions.BadRequestError as e:
+        return False, f"✗ {display_name}: Bad request - {e}"
+    except litellm.exceptions.NotFoundError as e:
+        return False, f"✗ {display_name}: Model not found - {e}"
+    except Exception as e:
+        return False, f"✗ {display_name}: {type(e).__name__} - {e}"
+
+
+def run_preflight_check(models: list[dict[str, Any]]) -> bool:
+    """Run preflight LLM check for all models.
+
+    Args:
+        models: List of model configurations to test
+
+    Returns:
+        True if all models passed, False otherwise
+    """
+    api_key = os.environ.get("LLM_API_KEY")
+    base_url = os.environ.get("LLM_BASE_URL", "https://llm-proxy.eval.all-hands.dev")
+    skip_preflight = os.environ.get("SKIP_PREFLIGHT", "").lower() == "true"
+
+    if skip_preflight:
+        print("Preflight check: SKIPPED (SKIP_PREFLIGHT=true)")
+        return True
+
+    if not api_key:
+        print("Preflight check: SKIPPED (LLM_API_KEY not set)")
+        return True
+
+    print(f"\nPreflight LLM check for {len(models)} model(s)...")
+    print("-" * 50)
+
+    all_passed = True
+    for model_config in models:
+        success, message = check_model(model_config, api_key, base_url)
+        print(message)
+        if not success:
+            all_passed = False
+
+    print("-" * 50)
+
+    if all_passed:
+        print(f"✓ All {len(models)} model(s) passed preflight check\n")
+    else:
+        print("✗ Some models failed preflight check")
+        print("Evaluation aborted to avoid wasting compute resources.\n")
+
+    return all_passed
+
+
 def main() -> None:
     model_ids_str = get_required_env("MODEL_IDS")
     github_output = get_required_env("GITHUB_OUTPUT")
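The skip conditions in `run_preflight_check` (skip when `SKIP_PREFLIGHT=true` or when `LLM_API_KEY` is unset) can be isolated into a tiny sketch. `preflight_enabled` is a hypothetical helper, taking the environment as a dict parameter for easy testing rather than reading `os.environ` as the diff's code does:

```python
def preflight_enabled(env: dict[str, str]) -> bool:
    """Mirror of run_preflight_check's gating: the check is skipped when
    SKIP_PREFLIGHT=true (case-insensitive) or when LLM_API_KEY is missing."""
    if env.get("SKIP_PREFLIGHT", "").lower() == "true":
        return False
    if not env.get("LLM_API_KEY"):
        return False
    return True


print(preflight_enabled({"LLM_API_KEY": "sk-test"}))  # True
print(preflight_enabled({"LLM_API_KEY": "sk-test", "SKIP_PREFLIGHT": "True"}))  # False
print(preflight_enabled({}))  # False
```

Note that in the real script a skipped check still returns `True` (the pipeline proceeds); this helper only answers whether the network check would actually run.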
@@ -216,14 +360,17 @@ def main() -> None:
 
     # Resolve model configs
     resolved = find_models_by_id(model_ids)
+    print(f"Resolved {len(resolved)} model(s): {', '.join(model_ids)}")
+
+    # Run preflight check
+    if not run_preflight_check(resolved):
+        error_exit("Preflight LLM check failed")
 
     # Output as JSON
     models_json = json.dumps(resolved, separators=(",", ":"))
     with open(github_output, "a", encoding="utf-8") as f:
         f.write(f"models_json={models_json}\n")
 
-    print(f"Resolved {len(resolved)} model(s): {', '.join(model_ids)}")
-
 
 if __name__ == "__main__":
     main()
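The `SDK_ONLY_PARAMS` filtering inside `check_model` can be exercised standalone. A minimal sketch with the `litellm.completion` call omitted; `litellm_kwargs` is a hypothetical name for the inline dict comprehension, and the values are copied from the GLM-5 entry in the diff:

```python
from typing import Any

# Mirrors SDK_ONLY_PARAMS from the diff: SDK-internal keys litellm must not see.
SDK_ONLY_PARAMS = {"disable_vision"}


def litellm_kwargs(llm_config: dict[str, Any]) -> dict[str, Any]:
    """Strip 'model' and SDK-only keys, as check_model does before calling
    litellm.completion (the model name is passed separately)."""
    return {
        k: v
        for k, v in llm_config.items()
        if k != "model" and k not in SDK_ONLY_PARAMS
    }


cfg = {
    "model": "litellm_proxy/openrouter/z-ai/glm-5",
    "disable_vision": True,
    "temperature": 0.0,
}
print(litellm_kwargs(cfg))  # {'temperature': 0.0}
```

Without this filtering, `disable_vision` would be forwarded as an unknown completion parameter, which is exactly why the diff introduces the shared `SDK_ONLY_PARAMS` set.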
