OpenHands
diff --git a/.agents/skills/custom-codereview-guide.md b/.agents/skills/custom-codereview-guide.md
Lines changed: 20 additions & 2 deletions
diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py
Lines changed: 40 additions & 10 deletions
@@ -13,9 +13,27 @@ You are an expert code reviewer for the **OpenHands/software-agent-sdk** reposit
 
 You have permission to **APPROVE** or **COMMENT** on PRs. Do not use REQUEST_CHANGES.
 
-**Default to APPROVE**: If your review finds no issues at "important" level or higher, approve the PR. Minor suggestions or nitpicks alone are not sufficient reason to withhold approval.
+### Review decision policy (eval / benchmark risk)
 
-**IMPORTANT: If you determine a PR is worth merging, you should approve it.** Don’t just say a PR is "worth merging" or "ready to merge" without actually submitting an approval. Your words and actions should be consistent.
+Do **NOT** submit an **APPROVE** review when the PR changes agent behavior or anything
+that could plausibly affect benchmark/evaluation performance.
+
+Examples include: prompt templates, tool calling/execution, planning/loop logic,
+memory/condenser behavior, terminal/stdin/stdout handling, or evaluation harness code.
+
+If a PR is in this category (or you are uncertain), leave a **COMMENT** review and
+explicitly flag it for a human maintainer to decide after running lightweight evals.
+
+### Default approval policy
+
+**Default to APPROVE**: For PRs outside the eval-risk category above, if your review
+finds no issues at "important" level or higher, approve the PR. Minor suggestions or
+nitpicks alone are not sufficient reason to withhold approval.
+
+**IMPORTANT:** If you determine a PR is worth merging **and it is not in the eval-risk
+category above**, you should approve it. Don’t just say a PR is "worth merging" or
+"ready to merge" without actually submitting an approval. Your words and actions should
+be consistent.
 
 ### When to APPROVE
 
 
@@ -37,7 +37,10 @@
     "kimi-k2-thinking": {
         "id": "kimi-k2-thinking",
         "display_name": "Kimi K2 Thinking",
-        "llm_config": {"model": "litellm_proxy/moonshot/kimi-k2-thinking"},
+        "llm_config": {
+            "model": "litellm_proxy/moonshot/kimi-k2-thinking",
+            "temperature": 1.0,
+        },
     },
     # https://www.kimi.com/blog/kimi-k2-5.html
     "kimi-k2.5": {
@@ -93,17 +96,26 @@
     "gemini-3-pro": {
         "id": "gemini-3-pro",
         "display_name": "Gemini 3 Pro",
-        "llm_config": {"model": "litellm_proxy/gemini-3-pro-preview"},
+        "llm_config": {
+            "model": "litellm_proxy/gemini-3-pro-preview",
+            "temperature": 0.0,
+        },
     },
     "gemini-3-flash": {
         "id": "gemini-3-flash",
         "display_name": "Gemini 3 Flash",
-        "llm_config": {"model": "litellm_proxy/gemini-3-flash-preview"},
+        "llm_config": {
+            "model": "litellm_proxy/gemini-3-flash-preview",
+            "temperature": 0.0,
+        },
     },
     "gemini-3.1-pro": {
         "id": "gemini-3.1-pro",
         "display_name": "Gemini 3.1 Pro",
-        "llm_config": {"model": "litellm_proxy/gemini-3.1-pro-preview"},
+        "llm_config": {
+            "model": "litellm_proxy/gemini-3.1-pro-preview",
+            "temperature": 0.0,
+        },
     },
     "gpt-5.2": {
         "id": "gpt-5.2",
@@ -126,7 +138,10 @@
     "minimax-m2": {
         "id": "minimax-m2",
         "display_name": "MiniMax M2",
-        "llm_config": {"model": "litellm_proxy/minimax/minimax-m2"},
+        "llm_config": {
+            "model": "litellm_proxy/minimax/minimax-m2",
+            "temperature": 0.0,
+        },
     },
     "minimax-m2.5": {
         "id": "minimax-m2.5",
@@ -140,7 +155,10 @@
     "minimax-m2.1": {
         "id": "minimax-m2.1",
         "display_name": "MiniMax M2.1",
-        "llm_config": {"model": "litellm_proxy/minimax/MiniMax-M2.1"},
+        "llm_config": {
+            "model": "litellm_proxy/minimax/MiniMax-M2.1",
+            "temperature": 0.0,
+        },
     },
     "deepseek-v3.2-reasoner": {
         "id": "deepseek-v3.2-reasoner",
@@ -151,7 +169,8 @@
         "id": "qwen-3-coder",
         "display_name": "Qwen 3 Coder",
         "llm_config": {
-            "model": "litellm_proxy/fireworks_ai/qwen3-coder-480b-a35b-instruct"
+            "model": "litellm_proxy/fireworks_ai/qwen3-coder-480b-a35b-instruct",
+            "temperature": 0.0,
         },
     },
     "nemotron-3-nano-30b": {
@@ -167,6 +186,7 @@
         "display_name": "GLM-4.7",
         "llm_config": {
             "model": "litellm_proxy/openrouter/z-ai/glm-4.7",
+            "temperature": 0.0,
             # OpenRouter glm-4.7 is text-only despite LiteLLM reporting vision support
             "disable_vision": True,
         },
@@ -176,24 +196,34 @@
         "display_name": "GLM-5",
         "llm_config": {
             "model": "litellm_proxy/openrouter/z-ai/glm-5",
+            "temperature": 0.0,
             # OpenRouter glm-5 is text-only despite LiteLLM reporting vision support
             "disable_vision": True,
         },
     },
     "qwen3-coder-next": {
         "id": "qwen3-coder-next",
         "display_name": "Qwen3 Coder Next",
-        "llm_config": {"model": "litellm_proxy/openrouter/qwen/qwen3-coder-next"},
+        "llm_config": {
+            "model": "litellm_proxy/openrouter/qwen/qwen3-coder-next",
+            "temperature": 0.0,
+        },
     },
     "qwen3-coder-30b-a3b-instruct": {
         "id": "qwen3-coder-30b-a3b-instruct",
         "display_name": "Qwen3 Coder 30B A3B Instruct",
-        "llm_config": {"model": "litellm_proxy/Qwen3-Coder-30B-A3B-Instruct"},
+        "llm_config": {
+            "model": "litellm_proxy/Qwen3-Coder-30B-A3B-Instruct",
+            "temperature": 0.0,
+        },
     },
     "gpt-oss-20b": {
         "id": "gpt-oss-20b",
         "display_name": "GPT OSS 20B",
-        "llm_config": {"model": "litellm_proxy/gpt-oss-20b"},
+        "llm_config": {
+            "model": "litellm_proxy/gpt-oss-20b",
+            "temperature": 0.0,
+        },
     },
 }