Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions openhands-tools/openhands/tools/delegate/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,15 +117,17 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
parent_visualizer = parent_conversation._visualizer
workspace_path = parent_conversation.state.workspace.working_dir

# Disable streaming for sub-agents since they run in
# separate threads without token callbacks
sub_agent_llm = parent_llm.model_copy(update={"stream": False})

resolved_agent_types = [
self._resolve_agent_type(action, i) for i in range(len(action.ids))
]

for agent_id, agent_type in zip(action.ids, resolved_agent_types):
# Each sub-agent gets its own LLM copy with independent metrics.
# model_copy() shallow-copies private attrs, so reset_metrics()
# is needed to break the shared Metrics reference with the parent.
sub_agent_llm = parent_llm.model_copy(update={"stream": False})
sub_agent_llm.reset_metrics()

factory = get_agent_factory(name=agent_type)
worker_agent = factory.factory_func(sub_agent_llm)

Expand Down Expand Up @@ -260,6 +262,17 @@ def run_task(
for thread in threads:
thread.join()

# Sync sub-agent metrics into parent conversation.
# Sub-agent metrics are cumulative, so replace (not merge)
# to avoid double-counting on repeated delegations.
parent_stats = parent_conversation.conversation_stats
for agent_id in action.tasks:
if agent_id in self._sub_agents:
sub_conv = self._sub_agents[agent_id]
parent_stats.usage_to_metrics[f"delegate:{agent_id}"] = (
sub_conv.conversation_stats.get_combined_metrics()
)

# Collect results in the same order as the input tasks
all_results = []

Expand Down
170 changes: 170 additions & 0 deletions tests/tools/delegate/test_delegation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pydantic import SecretStr

from openhands.sdk.agent.utils import fix_malformed_tool_arguments
from openhands.sdk.conversation.conversation_stats import ConversationStats
from openhands.sdk.conversation.state import ConversationExecutionStatus
from openhands.sdk.llm import LLM, TextContent
from openhands.sdk.subagent.registry import register_builtins_agents
Expand Down Expand Up @@ -213,6 +214,175 @@ def test_spawn_disables_streaming_for_sub_agents():
assert parent_llm.stream is True, "Parent LLM should still have streaming enabled"


def test_spawn_gives_sub_agents_independent_metrics():
    """Spawned sub-agents each own a Metrics object distinct from the parent's.

    Spawns two sub-agents from one parent LLM, then checks that no Metrics
    instance is shared (parent vs. child, child vs. child) and that adding
    cost on a child leaves the parent's accumulated cost untouched.
    """
    register_builtins_agents()
    parent_llm = LLM(
        model="openai/gpt-4o",
        api_key=SecretStr("test-key"),
        base_url="https://api.openai.com/v1",
    )

    # Minimal stand-in for the parent conversation used by the executor.
    parent_conversation = MagicMock()
    parent_conversation.id = uuid.uuid4()
    parent_conversation.agent.llm = parent_llm
    parent_conversation.state.workspace.working_dir = "/tmp"
    parent_conversation._visualizer = None

    executor = DelegateExecutor()
    executor(DelegateAction(command="spawn", ids=["a1", "a2"]), parent_conversation)

    sub_llms = [executor._sub_agents[name].agent.llm for name in ("a1", "a2")]
    first_llm, second_llm = sub_llms

    # Identity checks: no Metrics object may be shared with the parent
    # or between siblings.
    for sub_llm in sub_llms:
        assert sub_llm.metrics is not parent_llm.metrics
    assert first_llm.metrics is not second_llm.metrics

    # Behavioral check: spending on a sub-agent must not leak into the parent.
    baseline_cost = parent_llm.metrics.accumulated_cost
    for sub_llm in sub_llms:
        sub_llm.metrics.add_cost(1.00)
        assert parent_llm.metrics.accumulated_cost == baseline_cost


def test_delegate_merges_metrics_into_parent():
    """A delegate run publishes each sub-agent's metrics into the parent stats.

    After delegating to two spawned sub-agents, the parent's
    ``usage_to_metrics`` must hold one ``delegate:<id>`` entry per sub-agent,
    and the combined metrics must reflect the summed cost and token usage.
    """
    register_builtins_agents()
    parent_llm = LLM(
        model="openai/gpt-4o",
        api_key=SecretStr("test-key"),
        base_url="https://api.openai.com/v1",
    )
    parent_stats = ConversationStats()
    parent_stats.usage_to_metrics["agent"] = parent_llm.metrics

    # Minimal stand-in for the parent conversation, carrying real stats.
    parent_conversation = MagicMock()
    parent_conversation.id = uuid.uuid4()
    parent_conversation.agent.llm = parent_llm
    parent_conversation.state.workspace.working_dir = "/tmp"
    parent_conversation._visualizer = None
    parent_conversation.conversation_stats = parent_stats

    executor = DelegateExecutor()
    executor(DelegateAction(command="spawn", ids=["a1", "a2"]), parent_conversation)

    # Wire LLMs into sub-conv stats (simulates what _ensure_agent_ready does)
    for name in ("a1", "a2"):
        conv = executor._sub_agents[name]
        conv_llm = conv.agent.llm
        conv.conversation_stats.usage_to_metrics[conv_llm.usage_id] = conv_llm.metrics

    # Simulated usage per sub-agent:
    # (agent id, cost, prompt tokens, completion tokens, response id)
    simulated_usage = (
        ("a1", 1.00, 100, 50, "a1_r1"),
        ("a2", 2.00, 200, 100, "a2_r1"),
    )
    for name, cost, prompt, completion, response_id in simulated_usage:
        metrics = executor._sub_agents[name].agent.llm.metrics
        metrics.add_cost(cost)
        metrics.add_token_usage(
            prompt_tokens=prompt,
            completion_tokens=completion,
            cache_read_tokens=0,
            cache_write_tokens=0,
            context_window=128000,
            response_id=response_id,
        )

    # Run delegation with send_message/run patched out so that no real
    # LLM calls happen.
    a1_conv = executor._sub_agents["a1"]
    a2_conv = executor._sub_agents["a2"]
    with (
        patch.object(a1_conv, "send_message"),
        patch.object(a1_conv, "run"),
        patch.object(a2_conv, "send_message"),
        patch.object(a2_conv, "run"),
    ):
        executor(
            DelegateAction(
                command="delegate",
                tasks={"a1": "task 1", "a2": "task 2"},
            ),
            parent_conversation,
        )

    # Each sub-agent's metrics must now sit under its own delegate: key.
    expected_costs = {"delegate:a1": 1.00, "delegate:a2": 2.00}
    for key in expected_costs:
        assert key in parent_stats.usage_to_metrics
    for key, cost in expected_costs.items():
        assert parent_stats.usage_to_metrics[key].accumulated_cost == cost

    # The combined total covers the parent plus both sub-agents.
    combined = parent_stats.get_combined_metrics()
    assert combined.accumulated_cost == 3.00
    token_totals = combined.accumulated_token_usage
    assert token_totals is not None
    assert token_totals.prompt_tokens == 300
    assert token_totals.completion_tokens == 150


def test_repeated_delegation_does_not_double_count():
    """Two delegations to one sub-agent record its cumulative cost only once.

    The sub-agent's metrics are cumulative, so after a second delegation the
    parent's ``delegate:a1`` entry must equal the running total rather than
    the sum of both snapshots.
    """
    register_builtins_agents()
    parent_llm = LLM(
        model="openai/gpt-4o",
        api_key=SecretStr("test-key"),
        base_url="https://api.openai.com/v1",
    )
    parent_stats = ConversationStats()
    parent_stats.usage_to_metrics["agent"] = parent_llm.metrics

    # Minimal stand-in for the parent conversation, carrying real stats.
    parent_conversation = MagicMock()
    parent_conversation.id = uuid.uuid4()
    parent_conversation.agent.llm = parent_llm
    parent_conversation.state.workspace.working_dir = "/tmp"
    parent_conversation._visualizer = None
    parent_conversation.conversation_stats = parent_stats

    executor = DelegateExecutor()
    executor(DelegateAction(command="spawn", ids=["a1"]), parent_conversation)

    # Register the sub-agent's LLM metrics in its own conversation stats.
    sub_conv = executor._sub_agents["a1"]
    a1_llm = sub_conv.agent.llm
    sub_conv.conversation_stats.usage_to_metrics[a1_llm.usage_id] = a1_llm.metrics

    # Each round: (cost added before delegating, task text, expected cumulative
    # cost afterwards). Round two must read $3.00 (cumulative), not $4.00
    # (double-counted).
    rounds = (
        (1.00, "first task", 1.00),
        (2.00, "second task", 3.00),
    )
    for added_cost, task_text, expected_total in rounds:
        a1_llm.metrics.add_cost(added_cost)
        with (
            patch.object(executor._sub_agents["a1"], "send_message"),
            patch.object(executor._sub_agents["a1"], "run"),
        ):
            executor(
                DelegateAction(command="delegate", tasks={"a1": task_text}),
                parent_conversation,
            )
        recorded = parent_stats.usage_to_metrics["delegate:a1"]
        assert recorded.accumulated_cost == expected_total


def test_issue_2216():
"""Reproduce issue #2216: DelegateAction rejects tasks sent as a JSON string.

Expand Down
Loading