Skip to content

Commit d66d74d

Browse files
authored
Merge branch 'main' into fix-duplicate-tool-result-error
2 parents 029ebc4 + 9f521a4 commit d66d74d

File tree

32 files changed

+848
-253
lines changed

32 files changed

+848
-253
lines changed

.agents/skills/run-eval.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ curl -X POST \
3232

3333
**Key parameters:**
3434
- `benchmark`: `swebench`, `swebenchmultimodal`, `gaia`, `swtbench`, `commit0`, `multiswebench`
35-
- `eval_limit`: `1`, `50`, `100`, `200`, `500`
35+
- `eval_limit`: Any positive integer (e.g., `1`, `10`, `50`, `200`)
3636
- `model_ids`: See `.github/run-eval/resolve_model_config.py` for available models
3737
- `benchmarks_branch`: Use feature branch from the benchmarks repo to test benchmark changes before merging
3838

.github/run-eval/resolve_model_config.py

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,10 @@
3737
"kimi-k2-thinking": {
3838
"id": "kimi-k2-thinking",
3939
"display_name": "Kimi K2 Thinking",
40-
"llm_config": {"model": "litellm_proxy/moonshot/kimi-k2-thinking"},
40+
"llm_config": {
41+
"model": "litellm_proxy/moonshot/kimi-k2-thinking",
42+
"temperature": 1.0,
43+
},
4144
},
4245
# https://www.kimi.com/blog/kimi-k2-5.html
4346
"kimi-k2.5": {
@@ -93,17 +96,26 @@
9396
"gemini-3-pro": {
9497
"id": "gemini-3-pro",
9598
"display_name": "Gemini 3 Pro",
96-
"llm_config": {"model": "litellm_proxy/gemini-3-pro-preview"},
99+
"llm_config": {
100+
"model": "litellm_proxy/gemini-3-pro-preview",
101+
"temperature": 0.0,
102+
},
97103
},
98104
"gemini-3-flash": {
99105
"id": "gemini-3-flash",
100106
"display_name": "Gemini 3 Flash",
101-
"llm_config": {"model": "litellm_proxy/gemini-3-flash-preview"},
107+
"llm_config": {
108+
"model": "litellm_proxy/gemini-3-flash-preview",
109+
"temperature": 0.0,
110+
},
102111
},
103112
"gemini-3.1-pro": {
104113
"id": "gemini-3.1-pro",
105114
"display_name": "Gemini 3.1 Pro",
106-
"llm_config": {"model": "litellm_proxy/gemini-3.1-pro-preview"},
115+
"llm_config": {
116+
"model": "litellm_proxy/gemini-3.1-pro-preview",
117+
"temperature": 0.0,
118+
},
107119
},
108120
"gpt-5.2": {
109121
"id": "gpt-5.2",
@@ -126,7 +138,10 @@
126138
"minimax-m2": {
127139
"id": "minimax-m2",
128140
"display_name": "MiniMax M2",
129-
"llm_config": {"model": "litellm_proxy/minimax/minimax-m2"},
141+
"llm_config": {
142+
"model": "litellm_proxy/minimax/minimax-m2",
143+
"temperature": 0.0,
144+
},
130145
},
131146
"minimax-m2.5": {
132147
"id": "minimax-m2.5",
@@ -140,7 +155,10 @@
140155
"minimax-m2.1": {
141156
"id": "minimax-m2.1",
142157
"display_name": "MiniMax M2.1",
143-
"llm_config": {"model": "litellm_proxy/minimax/MiniMax-M2.1"},
158+
"llm_config": {
159+
"model": "litellm_proxy/minimax/MiniMax-M2.1",
160+
"temperature": 0.0,
161+
},
144162
},
145163
"deepseek-v3.2-reasoner": {
146164
"id": "deepseek-v3.2-reasoner",
@@ -151,7 +169,8 @@
151169
"id": "qwen-3-coder",
152170
"display_name": "Qwen 3 Coder",
153171
"llm_config": {
154-
"model": "litellm_proxy/fireworks_ai/qwen3-coder-480b-a35b-instruct"
172+
"model": "litellm_proxy/fireworks_ai/qwen3-coder-480b-a35b-instruct",
173+
"temperature": 0.0,
155174
},
156175
},
157176
"nemotron-3-nano-30b": {
@@ -167,6 +186,7 @@
167186
"display_name": "GLM-4.7",
168187
"llm_config": {
169188
"model": "litellm_proxy/openrouter/z-ai/glm-4.7",
189+
"temperature": 0.0,
170190
# OpenRouter glm-4.7 is text-only despite LiteLLM reporting vision support
171191
"disable_vision": True,
172192
},
@@ -176,24 +196,34 @@
176196
"display_name": "GLM-5",
177197
"llm_config": {
178198
"model": "litellm_proxy/openrouter/z-ai/glm-5",
199+
"temperature": 0.0,
179200
# OpenRouter glm-5 is text-only despite LiteLLM reporting vision support
180201
"disable_vision": True,
181202
},
182203
},
183204
"qwen3-coder-next": {
184205
"id": "qwen3-coder-next",
185206
"display_name": "Qwen3 Coder Next",
186-
"llm_config": {"model": "litellm_proxy/openrouter/qwen/qwen3-coder-next"},
207+
"llm_config": {
208+
"model": "litellm_proxy/openrouter/qwen/qwen3-coder-next",
209+
"temperature": 0.0,
210+
},
187211
},
188212
"qwen3-coder-30b-a3b-instruct": {
189213
"id": "qwen3-coder-30b-a3b-instruct",
190214
"display_name": "Qwen3 Coder 30B A3B Instruct",
191-
"llm_config": {"model": "litellm_proxy/Qwen3-Coder-30B-A3B-Instruct"},
215+
"llm_config": {
216+
"model": "litellm_proxy/Qwen3-Coder-30B-A3B-Instruct",
217+
"temperature": 0.0,
218+
},
192219
},
193220
"gpt-oss-20b": {
194221
"id": "gpt-oss-20b",
195222
"display_name": "GPT OSS 20B",
196-
"llm_config": {"model": "litellm_proxy/gpt-oss-20b"},
223+
"llm_config": {
224+
"model": "litellm_proxy/gpt-oss-20b",
225+
"temperature": 0.0,
226+
},
197227
},
198228
}
199229

.github/workflows/run-eval.yml

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,10 @@ on:
3232
default: false
3333
type: boolean
3434
eval_limit:
35-
description: Number of instances to run
35+
description: Number of instances to run (any positive integer)
3636
required: false
3737
default: '1'
38-
type: choice
39-
options:
40-
- '1'
41-
- '100'
42-
- '50'
43-
- '200'
44-
- '500'
38+
type: string
4539
model_ids:
4640
description: Comma-separated model IDs to evaluate. Must be keys of MODELS in resolve_model_config.py. Defaults to first model in that
4741
dict.
@@ -138,6 +132,14 @@ jobs:
138132
with:
139133
python-version: '3.13'
140134

135+
- name: Validate eval_limit
136+
if: github.event_name == 'workflow_dispatch'
137+
run: |
138+
if ! [[ "${{ github.event.inputs.eval_limit }}" =~ ^[1-9][0-9]*$ ]]; then
139+
echo "Error: eval_limit must be a positive integer, got: ${{ github.event.inputs.eval_limit }}"
140+
exit 1
141+
fi
142+
141143
- name: Validate SDK reference (semantic version check)
142144
if: github.event_name == 'workflow_dispatch'
143145
env:

examples/01_standalone_sdk/25_agent_delegation.py

Lines changed: 56 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99

1010
import os
1111

12-
from pydantic import SecretStr
13-
1412
from openhands.sdk import (
1513
LLM,
1614
Agent,
@@ -26,30 +24,26 @@
2624
DelegateTool,
2725
DelegationVisualizer,
2826
)
29-
from openhands.tools.preset.default import get_default_tools
27+
from openhands.tools.preset.default import get_default_tools, register_builtins_agents
3028

3129

3230
ONLY_RUN_SIMPLE_DELEGATION = False
3331

3432
logger = get_logger(__name__)
3533

3634
# Configure LLM and agent
37-
# You can get an API key from https://app.all-hands.dev/settings/api-keys
38-
api_key = os.getenv("LLM_API_KEY")
39-
assert api_key is not None, "LLM_API_KEY environment variable is not set."
40-
model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
4135
llm = LLM(
42-
model=model,
43-
api_key=SecretStr(api_key),
36+
model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"),
37+
api_key=os.getenv("LLM_API_KEY"),
4438
base_url=os.environ.get("LLM_BASE_URL", None),
4539
usage_id="agent",
4640
)
4741

4842
cwd = os.getcwd()
4943

50-
register_tool("DelegateTool", DelegateTool)
51-
tools = get_default_tools(enable_browser=False)
52-
tools.append(Tool(name="DelegateTool"))
44+
tools = get_default_tools(enable_browser=True)
45+
tools.append(Tool(name=DelegateTool.name))
46+
register_builtins_agents()
5347

5448
main_agent = Agent(
5549
llm=llm,
@@ -61,7 +55,7 @@
6155
visualizer=DelegationVisualizer(name="Delegator"),
6256
)
6357

64-
task_message = (
58+
conversation.send_message(
6559
"Forget about coding. Let's switch to travel planning. "
6660
"Let's plan a trip to London. I have two issues I need to solve: "
6761
"Lodging: what are the best areas to stay at while keeping budget in mind? "
@@ -72,7 +66,6 @@
7266
"They should keep it short. After getting the results, merge both analyses "
7367
"into a single consolidated report.\n\n"
7468
)
75-
conversation.send_message(task_message)
7669
conversation.run()
7770

7871
conversation.send_message(
@@ -81,18 +74,57 @@
8174
conversation.run()
8275

8376
# Report cost for simple delegation example
84-
cost_1 = conversation.conversation_stats.get_combined_metrics().accumulated_cost
85-
print(f"EXAMPLE_COST (simple delegation): {cost_1}")
77+
cost_simple = conversation.conversation_stats.get_combined_metrics().accumulated_cost
78+
print(f"EXAMPLE_COST (simple delegation): {cost_simple}")
8679

8780
print("Simple delegation example done!", "\n" * 20)
8881

89-
90-
# -------- Agent Delegation Second Part: User-Defined Agent Types --------
91-
9282
if ONLY_RUN_SIMPLE_DELEGATION:
83+
# For CI: always emit the EXAMPLE_COST marker before exiting.
84+
print(f"EXAMPLE_COST: {cost_simple}")
9385
exit(0)
9486

9587

88+
# -------- Agent Delegation Second Part: Built-in Agent Types (Explore + Bash) --------
89+
90+
main_agent = Agent(
91+
llm=llm,
92+
tools=[Tool(name=DelegateTool.name)],
93+
)
94+
conversation = Conversation(
95+
agent=main_agent,
96+
workspace=cwd,
97+
visualizer=DelegationVisualizer(name="Delegator (builtins)"),
98+
)
99+
100+
builtin_task_message = (
101+
"Demonstrate SDK built-in sub-agent types. "
102+
"1) Spawn an 'explore' sub-agent and ask it to list the markdown files in "
103+
"openhands-sdk/openhands/sdk/subagent/builtins/ and summarize what each "
104+
"built-in agent type is for (based on the file contents). "
105+
"2) Spawn a 'bash' sub-agent and ask it to run `python --version` in the "
106+
"terminal and return the exact output. "
107+
"3) Merge both results into a short report. "
108+
"Do not use internet access."
109+
)
110+
111+
print("=" * 100)
112+
print("Demonstrating built-in agent delegation (explore + bash)...")
113+
print("=" * 100)
114+
115+
conversation.send_message(builtin_task_message)
116+
conversation.run()
117+
118+
# Report cost for builtin agent types example
119+
cost_builtin = conversation.conversation_stats.get_combined_metrics().accumulated_cost
120+
print(f"EXAMPLE_COST (builtin agents): {cost_builtin}")
121+
122+
print("Built-in agent delegation example done!", "\n" * 20)
123+
124+
125+
# -------- Agent Delegation Third Part: User-Defined Agent Types --------
126+
127+
96128
def create_lodging_planner(llm: LLM) -> Agent:
97129
"""Create a lodging planner focused on London stays."""
98130
skills = [
@@ -190,10 +222,12 @@ def create_activities_planner(llm: LLM) -> Agent:
190222
conversation.run()
191223

192224
# Report cost for user-defined agent types example
193-
cost_2 = conversation.conversation_stats.get_combined_metrics().accumulated_cost
194-
print(f"EXAMPLE_COST (user-defined agents): {cost_2}")
225+
cost_user_defined = (
226+
conversation.conversation_stats.get_combined_metrics().accumulated_cost
227+
)
228+
print(f"EXAMPLE_COST (user-defined agents): {cost_user_defined}")
195229

196230
print("All done!")
197231

198232
# Full example cost report for CI workflow
199-
print(f"EXAMPLE_COST: {cost_1 + cost_2}")
233+
print(f"EXAMPLE_COST: {cost_simple + cost_builtin + cost_user_defined}")

openhands-agent-server/openhands/agent_server/git_router.py

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,33 +4,74 @@
44
import logging
55
from pathlib import Path
66

7-
from fastapi import APIRouter
7+
from fastapi import APIRouter, Query
88

99
from openhands.agent_server.server_details_router import update_last_execution_time
1010
from openhands.sdk.git.git_changes import get_git_changes
1111
from openhands.sdk.git.git_diff import get_git_diff
1212
from openhands.sdk.git.models import GitChange, GitDiff
13+
from openhands.sdk.utils.deprecation import deprecated
1314

1415

1516
git_router = APIRouter(prefix="/git", tags=["Git"])
1617
logger = logging.getLogger(__name__)
1718

1819

19-
@git_router.get("/changes/{path:path}")
20-
async def git_changes(
21-
path: Path,
22-
) -> list[GitChange]:
20+
async def _get_git_changes(path: str) -> list[GitChange]:
21+
"""Internal helper to get git changes for a given path."""
2322
update_last_execution_time()
2423
loop = asyncio.get_running_loop()
25-
changes = await loop.run_in_executor(None, get_git_changes, path)
26-
return changes
24+
return await loop.run_in_executor(None, get_git_changes, Path(path))
2725

2826

29-
@git_router.get("/diff/{path:path}")
30-
async def git_diff(
31-
path: Path,
32-
) -> GitDiff:
27+
async def _get_git_diff(path: str) -> GitDiff:
28+
"""Internal helper to get git diff for a given path."""
3329
update_last_execution_time()
3430
loop = asyncio.get_running_loop()
35-
changes = await loop.run_in_executor(None, get_git_diff, path)
36-
return changes
31+
return await loop.run_in_executor(None, get_git_diff, Path(path))
32+
33+
34+
@git_router.get("/changes")
35+
async def git_changes_query(
36+
path: str = Query(..., description="The git repository path"),
37+
) -> list[GitChange]:
38+
"""Get git changes using query parameter (preferred method)."""
39+
return await _get_git_changes(path)
40+
41+
42+
@git_router.get("/changes/{path:path}")
43+
@deprecated(
44+
deprecated_in="1.15.0",
45+
removed_in="1.20.0",
46+
details=(
47+
"Use the /git/changes endpoint with a query parameter for the path "
48+
"instead of a path parameter. This allows for better handling of "
49+
"complex paths and is more consistent with other endpoints."
50+
),
51+
)
52+
async def git_changes_path(path: str) -> list[GitChange]:
53+
"""Get git changes using path parameter (legacy, for backwards compatibility)."""
54+
return await _get_git_changes(path)
55+
56+
57+
@git_router.get("/diff")
58+
async def git_diff_query(
59+
path: str = Query(..., description="The file path to get diff for"),
60+
) -> GitDiff:
61+
"""Get git diff using query parameter (preferred method)."""
62+
return await _get_git_diff(path)
63+
64+
65+
@git_router.get("/diff/{path:path}")
66+
@deprecated(
67+
deprecated_in="1.15.0",
68+
removed_in="1.20.0",
69+
details=(
70+
"Use the /git/diff endpoint with a query parameter for the path "
71+
"instead of a path parameter. This allows for better handling of "
72+
"complex paths and is more consistent with other endpoints."
73+
),
74+
)
75+
async def git_diff_path(path: str) -> GitDiff:
76+
"""Get git diff using path parameter (legacy, for backwards compatibility)."""
77+
return await _get_git_diff(path)

0 commit comments

Comments (0)