OpenHands
diff --git a/.agents/skills/custom-codereview-guide.md b/.agents/skills/custom-codereview-guide.md
Lines changed: 20 additions & 2 deletions
diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py
Lines changed: 34 additions & 9 deletions
@@ -13,9 +13,27 @@ You are an expert code reviewer for the **OpenHands/software-agent-sdk** reposit
 
 You have permission to **APPROVE** or **COMMENT** on PRs. Do not use REQUEST_CHANGES.
 
-**Default to APPROVE**: If your review finds no issues at "important" level or higher, approve the PR. Minor suggestions or nitpicks alone are not sufficient reason to withhold approval.
+### Review decision policy (eval / benchmark risk)
 
-**IMPORTANT: If you determine a PR is worth merging, you should approve it.** Don’t just say a PR is "worth merging" or "ready to merge" without actually submitting an approval. Your words and actions should be consistent.
+Do **NOT** submit an **APPROVE** review when the PR changes agent behavior or anything
+that could plausibly affect benchmark/evaluation performance.
+
+Examples include: prompt templates, tool calling/execution, planning/loop logic,
+memory/condenser behavior, terminal/stdin/stdout handling, or evaluation harness code.
+
+If a PR is in this category (or you are uncertain), leave a **COMMENT** review and
+explicitly flag it for a human maintainer to decide after running lightweight evals.
+
+### Default approval policy
+
+**Default to APPROVE**: If your review finds no issues at "important" level or higher
+and the PR is not in the eval-risk category above, approve the PR. Minor suggestions
+or nitpicks alone are not sufficient reason to withhold approval.
+
+**IMPORTANT:** If you determine a PR is worth merging **and it is not in the eval-risk
+category above**, you should approve it. Don’t just say a PR is "worth merging" or
+"ready to merge" without actually submitting an approval. Your words and actions should
+be consistent.
 
 ### When to APPROVE
 
 
@@ -18,6 +18,12 @@
 from typing import Any
 
 
+# SDK-specific parameters that should not be passed to litellm.
+# These parameters are used by the SDK's LLM wrapper but are not part of litellm's API.
+# Keep this set in sync with the SDK LLM config parameters that are SDK-internal.
+SDK_ONLY_PARAMS = {"disable_vision"}
+
+
 # Model configurations dictionary
 MODELS = {
     "claude-sonnet-4-5-20250929": {
@@ -229,13 +235,13 @@ def find_models_by_id(model_ids: list[str]) -> list[dict]:
     return resolved
 
 
-def test_model(
+def check_model(
     model_config: dict[str, Any],
     api_key: str,
     base_url: str,
     timeout: int = 60,
 ) -> tuple[bool, str]:
-    """Test a single model with a simple completion request using litellm.
+    """Check a single model with a simple completion request using litellm.
 
     Args:
         model_config: Model configuration dict with 'llm_config' key
@@ -253,24 +259,43 @@ def test_model(
     display_name = model_config.get("display_name", model_name)
 
     try:
-        # Build kwargs from llm_config, excluding 'model' which is passed separately
-        kwargs = {k: v for k, v in llm_config.items() if k != "model"}
-
+        # Build kwargs from llm_config, excluding 'model' and SDK-specific params
+        kwargs = {
+            k: v
+            for k, v in llm_config.items()
+            if k != "model" and k not in SDK_ONLY_PARAMS
+        }
+
+        # Use a simple arithmetic prompt that works reliably across all models.
+        # max_tokens=100 leaves enough room for models to respond
+        # (some need more than 10 tokens).
         response = litellm.completion(
             model=model_name,
-            messages=[{"role": "user", "content": "Say 'OK' if you can read this."}],
-            max_tokens=10,
+            messages=[{"role": "user", "content": "1+1="}],
+            max_tokens=100,
             api_key=api_key,
             base_url=base_url,
             timeout=timeout,
             **kwargs,
         )
 
         content = response.choices[0].message.content if response.choices else None
+
         if content:
             return True, f"✓ {display_name}: OK"
         else:
-            return False, f"✗ {display_name}: Empty response"
+            # Collect finish_reason and usage from the response for diagnostics
+            finish_reason = (
+                response.choices[0].finish_reason if response.choices else None
+            )
+            usage = getattr(response, "usage", None)
+            return (
+                False,
+                (
+                    f"✗ {display_name}: Empty response "
+                    f"(finish_reason={finish_reason}, usage={usage})"
+                ),
+            )
 
     except litellm.exceptions.Timeout:
         return False, f"✗ {display_name}: Request timed out after {timeout}s"
@@ -310,7 +335,7 @@ def run_preflight_check(models: list[dict[str, Any]]) -> bool:
 
     all_passed = True
     for model_config in models:
-        success, message = test_model(model_config, api_key, base_url)
+        success, message = check_model(model_config, api_key, base_url)
         print(message)
         if not success:
             all_passed = False