Skip to content

Commit a691cda

Browse files
Authored by: VascoSch92, openhands-agent, all-hands-bot
fix(tools): merge subagents metrics (DelegateTool) (#2221)
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: OpenHands Bot <contact@all-hands.dev>
1 parent 731809d commit a691cda

File tree

2 files changed

+187
-4
lines changed

2 files changed

+187
-4
lines changed

openhands-tools/openhands/tools/delegate/impl.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,15 +117,17 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
117117
parent_visualizer = parent_conversation._visualizer
118118
workspace_path = parent_conversation.state.workspace.working_dir
119119

120-
# Disable streaming for sub-agents since they run in
121-
# separate threads without token callbacks
122-
sub_agent_llm = parent_llm.model_copy(update={"stream": False})
123-
124120
resolved_agent_types = [
125121
self._resolve_agent_type(action, i) for i in range(len(action.ids))
126122
]
127123

128124
for agent_id, agent_type in zip(action.ids, resolved_agent_types):
125+
# Each sub-agent gets its own LLM copy with independent metrics.
126+
# model_copy() shallow-copies private attrs, so reset_metrics()
127+
# is needed to break the shared Metrics reference with the parent.
128+
sub_agent_llm = parent_llm.model_copy(update={"stream": False})
129+
sub_agent_llm.reset_metrics()
130+
129131
factory = get_agent_factory(name=agent_type)
130132
worker_agent = factory.factory_func(sub_agent_llm)
131133

@@ -260,6 +262,17 @@ def run_task(
260262
for thread in threads:
261263
thread.join()
262264

265+
# Sync sub-agent metrics into parent conversation.
266+
# Sub-agent metrics are cumulative, so replace (not merge)
267+
# to avoid double-counting on repeated delegations.
268+
parent_stats = parent_conversation.conversation_stats
269+
for agent_id in action.tasks:
270+
if agent_id in self._sub_agents:
271+
sub_conv = self._sub_agents[agent_id]
272+
parent_stats.usage_to_metrics[f"delegate:{agent_id}"] = (
273+
sub_conv.conversation_stats.get_combined_metrics()
274+
)
275+
263276
# Collect results in the same order as the input tasks
264277
all_results = []
265278

tests/tools/delegate/test_delegation.py

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pydantic import SecretStr
88

99
from openhands.sdk.agent.utils import fix_malformed_tool_arguments
10+
from openhands.sdk.conversation.conversation_stats import ConversationStats
1011
from openhands.sdk.conversation.state import ConversationExecutionStatus
1112
from openhands.sdk.llm import LLM, TextContent
1213
from openhands.sdk.subagent.registry import register_builtins_agents
@@ -213,6 +214,175 @@ def test_spawn_disables_streaming_for_sub_agents():
213214
assert parent_llm.stream is True, "Parent LLM should still have streaming enabled"
214215

215216

217+
def test_spawn_gives_sub_agents_independent_metrics():
218+
"""Sub-agents must not share the parent's Metrics object."""
219+
register_builtins_agents()
220+
parent_llm = LLM(
221+
model="openai/gpt-4o",
222+
api_key=SecretStr("test-key"),
223+
base_url="https://api.openai.com/v1",
224+
)
225+
226+
parent_conversation = MagicMock()
227+
parent_conversation.id = uuid.uuid4()
228+
parent_conversation.agent.llm = parent_llm
229+
parent_conversation.state.workspace.working_dir = "/tmp"
230+
parent_conversation._visualizer = None
231+
232+
executor = DelegateExecutor()
233+
spawn_action = DelegateAction(command="spawn", ids=["a1", "a2"])
234+
executor(spawn_action, parent_conversation)
235+
236+
a1_llm = executor._sub_agents["a1"].agent.llm
237+
a2_llm = executor._sub_agents["a2"].agent.llm
238+
239+
# Each sub-agent must have its own Metrics, not the parent's
240+
assert a1_llm.metrics is not parent_llm.metrics
241+
assert a2_llm.metrics is not parent_llm.metrics
242+
assert a1_llm.metrics is not a2_llm.metrics
243+
244+
# Mutating a sub-agent's metrics must not affect the parent
245+
before = parent_llm.metrics.accumulated_cost
246+
a1_llm.metrics.add_cost(1.00)
247+
assert parent_llm.metrics.accumulated_cost == before
248+
a2_llm.metrics.add_cost(1.00)
249+
assert parent_llm.metrics.accumulated_cost == before
250+
251+
252+
def test_delegate_merges_metrics_into_parent():
253+
"""After delegation, sub-agent metrics appear in parent stats."""
254+
register_builtins_agents()
255+
parent_llm = LLM(
256+
model="openai/gpt-4o",
257+
api_key=SecretStr("test-key"),
258+
base_url="https://api.openai.com/v1",
259+
)
260+
parent_stats = ConversationStats()
261+
parent_stats.usage_to_metrics["agent"] = parent_llm.metrics
262+
263+
parent_conversation = MagicMock()
264+
parent_conversation.id = uuid.uuid4()
265+
parent_conversation.agent.llm = parent_llm
266+
parent_conversation.state.workspace.working_dir = "/tmp"
267+
parent_conversation._visualizer = None
268+
parent_conversation.conversation_stats = parent_stats
269+
270+
executor = DelegateExecutor()
271+
spawn_action = DelegateAction(command="spawn", ids=["a1", "a2"])
272+
executor(spawn_action, parent_conversation)
273+
274+
# Wire LLMs into sub-conv stats (simulates what _ensure_agent_ready does)
275+
for agent_id in ("a1", "a2"):
276+
sub_conv = executor._sub_agents[agent_id]
277+
llm = sub_conv.agent.llm
278+
sub_conv.conversation_stats.usage_to_metrics[llm.usage_id] = llm.metrics
279+
280+
# Simulate sub-agent LLM usage
281+
a1_llm = executor._sub_agents["a1"].agent.llm
282+
a2_llm = executor._sub_agents["a2"].agent.llm
283+
a1_llm.metrics.add_cost(1.00)
284+
a1_llm.metrics.add_token_usage(
285+
prompt_tokens=100,
286+
completion_tokens=50,
287+
cache_read_tokens=0,
288+
cache_write_tokens=0,
289+
context_window=128000,
290+
response_id="a1_r1",
291+
)
292+
a2_llm.metrics.add_cost(2.00)
293+
a2_llm.metrics.add_token_usage(
294+
prompt_tokens=200,
295+
completion_tokens=100,
296+
cache_read_tokens=0,
297+
cache_write_tokens=0,
298+
context_window=128000,
299+
response_id="a2_r1",
300+
)
301+
302+
# Run delegation (patching send_message/run so no real LLM calls happen)
303+
with (
304+
patch.object(executor._sub_agents["a1"], "send_message"),
305+
patch.object(executor._sub_agents["a1"], "run"),
306+
patch.object(executor._sub_agents["a2"], "send_message"),
307+
patch.object(executor._sub_agents["a2"], "run"),
308+
):
309+
delegate_action = DelegateAction(
310+
command="delegate",
311+
tasks={"a1": "task 1", "a2": "task 2"},
312+
)
313+
executor(delegate_action, parent_conversation)
314+
315+
# Sub-agent metrics are now in parent stats under delegate: keys
316+
assert "delegate:a1" in parent_stats.usage_to_metrics
317+
assert "delegate:a2" in parent_stats.usage_to_metrics
318+
assert parent_stats.usage_to_metrics["delegate:a1"].accumulated_cost == 1.00
319+
assert parent_stats.usage_to_metrics["delegate:a2"].accumulated_cost == 2.00
320+
321+
# Combined total includes parent + both sub-agents
322+
combined = parent_stats.get_combined_metrics()
323+
assert combined.accumulated_cost == 3.00
324+
accumulated_token_usage = combined.accumulated_token_usage
325+
assert accumulated_token_usage is not None
326+
assert accumulated_token_usage.prompt_tokens == 300
327+
assert accumulated_token_usage.completion_tokens == 150
328+
329+
330+
def test_repeated_delegation_does_not_double_count():
331+
"""Delegating to the same agent twice must not duplicate metrics."""
332+
register_builtins_agents()
333+
parent_llm = LLM(
334+
model="openai/gpt-4o",
335+
api_key=SecretStr("test-key"),
336+
base_url="https://api.openai.com/v1",
337+
)
338+
parent_stats = ConversationStats()
339+
parent_stats.usage_to_metrics["agent"] = parent_llm.metrics
340+
341+
parent_conversation = MagicMock()
342+
parent_conversation.id = uuid.uuid4()
343+
parent_conversation.agent.llm = parent_llm
344+
parent_conversation.state.workspace.working_dir = "/tmp"
345+
parent_conversation._visualizer = None
346+
parent_conversation.conversation_stats = parent_stats
347+
348+
executor = DelegateExecutor()
349+
spawn_action = DelegateAction(command="spawn", ids=["a1"])
350+
executor(spawn_action, parent_conversation)
351+
352+
sub_conv = executor._sub_agents["a1"]
353+
sub_conv.conversation_stats.usage_to_metrics[sub_conv.agent.llm.usage_id] = (
354+
sub_conv.agent.llm.metrics
355+
)
356+
357+
a1_llm = executor._sub_agents["a1"].agent.llm
358+
359+
# First delegation: sub-agent accumulates $1.00
360+
a1_llm.metrics.add_cost(1.00)
361+
with (
362+
patch.object(executor._sub_agents["a1"], "send_message"),
363+
patch.object(executor._sub_agents["a1"], "run"),
364+
):
365+
executor(
366+
DelegateAction(command="delegate", tasks={"a1": "first task"}),
367+
parent_conversation,
368+
)
369+
assert parent_stats.usage_to_metrics["delegate:a1"].accumulated_cost == 1.00
370+
371+
# Second delegation: sub-agent accumulates another $2.00 (cumulative $3.00)
372+
a1_llm.metrics.add_cost(2.00)
373+
with (
374+
patch.object(executor._sub_agents["a1"], "send_message"),
375+
patch.object(executor._sub_agents["a1"], "run"),
376+
):
377+
executor(
378+
DelegateAction(command="delegate", tasks={"a1": "second task"}),
379+
parent_conversation,
380+
)
381+
382+
# Must be $3.00 (cumulative), not $4.00 (double-counted)
383+
assert parent_stats.usage_to_metrics["delegate:a1"].accumulated_cost == 3.00
384+
385+
216386
def test_issue_2216():
217387
"""Reproduce issue #2216: DelegateAction rejects tasks sent as a JSON string.
218388

0 commit comments

Comments (0)