Skip to content

Commit 0112183

Browse files
authored
Merge branch 'main' into feat/tools-j2-descriptions
2 parents d53235d + 2b54375 commit 0112183

File tree

36 files changed

+2029
-91
lines changed

36 files changed

+2029
-91
lines changed

.agents/skills/custom-codereview-guide.md

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,27 @@ You are an expert code reviewer for the **OpenHands/software-agent-sdk** reposit
1313

1414
You have permission to **APPROVE** or **COMMENT** on PRs. Do not use REQUEST_CHANGES.
1515

16-
**Default to APPROVE**: If your review finds no issues at "important" level or higher, approve the PR. Minor suggestions or nitpicks alone are not sufficient reason to withhold approval.
16+
### Review decision policy (eval / benchmark risk)
1717

18-
**IMPORTANT: If you determine a PR is worth merging, you should approve it.** Don’t just say a PR is "worth merging" or "ready to merge" without actually submitting an approval. Your words and actions should be consistent.
18+
Do **NOT** submit an **APPROVE** review when the PR changes agent behavior or anything
19+
that could plausibly affect benchmark/evaluation performance.
20+
21+
Examples include: prompt templates, tool calling/execution, planning/loop logic,
22+
memory/condenser behavior, terminal/stdin/stdout handling, or evaluation harness code.
23+
24+
If a PR is in this category (or you are uncertain), leave a **COMMENT** review and
25+
explicitly flag it for a human maintainer to decide after running lightweight evals.
26+
27+
### Default approval policy
28+
29+
**Default to APPROVE**: If your review finds no issues at "important" level or higher,
30+
approve the PR. Minor suggestions or nitpicks alone are not sufficient reason to
31+
withhold approval.
32+
33+
**IMPORTANT:** If you determine a PR is worth merging **and it is not in the eval-risk
34+
category above**, you should approve it. Don’t just say a PR is "worth merging" or
35+
"ready to merge" without actually submitting an approval. Your words and actions should
36+
be consistent.
1937

2038
### When to APPROVE
2139

.github/run-eval/resolve_model_config.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@
1818
from typing import Any
1919

2020

21+
# SDK-specific parameters that should not be passed to litellm.
22+
# These parameters are used by the SDK's LLM wrapper but are not part of litellm's API.
23+
# Keep this list in sync with SDK LLM config parameters that are SDK-internal.
24+
SDK_ONLY_PARAMS = {"disable_vision"}
25+
26+
2127
# Model configurations dictionary
2228
MODELS = {
2329
"claude-sonnet-4-5-20250929": {
@@ -229,13 +235,13 @@ def find_models_by_id(model_ids: list[str]) -> list[dict]:
229235
return resolved
230236

231237

232-
def test_model(
238+
def check_model(
233239
model_config: dict[str, Any],
234240
api_key: str,
235241
base_url: str,
236242
timeout: int = 60,
237243
) -> tuple[bool, str]:
238-
"""Test a single model with a simple completion request using litellm.
244+
"""Check a single model with a simple completion request using litellm.
239245
240246
Args:
241247
model_config: Model configuration dict with 'llm_config' key
@@ -253,24 +259,43 @@ def test_model(
253259
display_name = model_config.get("display_name", model_name)
254260

255261
try:
256-
# Build kwargs from llm_config, excluding 'model' which is passed separately
257-
kwargs = {k: v for k, v in llm_config.items() if k != "model"}
258-
262+
# Build kwargs from llm_config, excluding 'model' and SDK-specific params
263+
kwargs = {
264+
k: v
265+
for k, v in llm_config.items()
266+
if k != "model" and k not in SDK_ONLY_PARAMS
267+
}
268+
269+
# Use simple arithmetic prompt that works reliably across all models
270+
# max_tokens=100 provides enough room for models to respond
271+
# (some need >10 tokens)
259272
response = litellm.completion(
260273
model=model_name,
261-
messages=[{"role": "user", "content": "Say 'OK' if you can read this."}],
262-
max_tokens=10,
274+
messages=[{"role": "user", "content": "1+1="}],
275+
max_tokens=100,
263276
api_key=api_key,
264277
base_url=base_url,
265278
timeout=timeout,
266279
**kwargs,
267280
)
268281

269282
content = response.choices[0].message.content if response.choices else None
283+
270284
if content:
271285
return True, f"✓ {display_name}: OK"
272286
else:
273-
return False, f"✗ {display_name}: Empty response"
287+
# Check if there's any other data in the response for diagnostics
288+
finish_reason = (
289+
response.choices[0].finish_reason if response.choices else None
290+
)
291+
usage = getattr(response, "usage", None)
292+
return (
293+
False,
294+
(
295+
f"✗ {display_name}: Empty response "
296+
f"(finish_reason={finish_reason}, usage={usage})"
297+
),
298+
)
274299

275300
except litellm.exceptions.Timeout:
276301
return False, f"✗ {display_name}: Request timed out after {timeout}s"
@@ -310,7 +335,7 @@ def run_preflight_check(models: list[dict[str, Any]]) -> bool:
310335

311336
all_passed = True
312337
for model_config in models:
313-
success, message = test_model(model_config, api_key, base_url)
338+
success, message = check_model(model_config, api_key, base_url)
314339
print(message)
315340
if not success:
316341
all_passed = False

0 commit comments

Comments
 (0)