|
7 | 7 | from pydantic import SecretStr |
8 | 8 |
|
9 | 9 | from openhands.sdk.agent.utils import fix_malformed_tool_arguments |
| 10 | +from openhands.sdk.conversation.conversation_stats import ConversationStats |
10 | 11 | from openhands.sdk.conversation.state import ConversationExecutionStatus |
11 | 12 | from openhands.sdk.llm import LLM, TextContent |
12 | 13 | from openhands.sdk.subagent.registry import register_builtins_agents |
@@ -213,6 +214,175 @@ def test_spawn_disables_streaming_for_sub_agents(): |
213 | 214 | assert parent_llm.stream is True, "Parent LLM should still have streaming enabled" |
214 | 215 |
|
215 | 216 |
|
def test_spawn_gives_sub_agents_independent_metrics():
    """Spawned sub-agents each own a Metrics object distinct from the parent's.

    Spawns two sub-agents from a mocked parent conversation, checks that no
    two of the three LLMs share a Metrics instance, and then verifies that
    adding cost on a sub-agent leaves the parent's accumulated cost unchanged.
    """
    register_builtins_agents()
    parent_llm = LLM(
        model="openai/gpt-4o",
        api_key=SecretStr("test-key"),
        base_url="https://api.openai.com/v1",
    )

    # Minimal stand-in for the parent conversation the executor reads from.
    parent_conversation = MagicMock()
    parent_conversation.id = uuid.uuid4()
    parent_conversation.agent.llm = parent_llm
    parent_conversation.state.workspace.working_dir = "/tmp"
    parent_conversation._visualizer = None

    executor = DelegateExecutor()
    executor(DelegateAction(command="spawn", ids=["a1", "a2"]), parent_conversation)

    sub_llms = [executor._sub_agents[name].agent.llm for name in ("a1", "a2")]
    first_llm, second_llm = sub_llms

    # Identity checks: no Metrics instance is shared between any pair.
    assert first_llm.metrics is not parent_llm.metrics
    assert second_llm.metrics is not parent_llm.metrics
    assert first_llm.metrics is not second_llm.metrics

    # Charging a sub-agent must leave the parent's running cost untouched.
    baseline_cost = parent_llm.metrics.accumulated_cost
    for sub_llm in sub_llms:
        sub_llm.metrics.add_cost(1.00)
        assert parent_llm.metrics.accumulated_cost == baseline_cost
| 250 | + |
| 251 | + |
def test_delegate_merges_metrics_into_parent():
    """Delegating surfaces each sub-agent's metrics in the parent's stats.

    After a ``delegate`` command, the parent ``ConversationStats`` is expected
    to hold one ``delegate:<id>`` entry per sub-agent, and the combined metrics
    must equal the sum of the parent's and both sub-agents' usage.
    """
    register_builtins_agents()
    parent_llm = LLM(
        model="openai/gpt-4o",
        api_key=SecretStr("test-key"),
        base_url="https://api.openai.com/v1",
    )
    parent_stats = ConversationStats()
    parent_stats.usage_to_metrics["agent"] = parent_llm.metrics

    # Minimal stand-in for the parent conversation the executor reads from.
    parent_conversation = MagicMock()
    parent_conversation.id = uuid.uuid4()
    parent_conversation.agent.llm = parent_llm
    parent_conversation.state.workspace.working_dir = "/tmp"
    parent_conversation._visualizer = None
    parent_conversation.conversation_stats = parent_stats

    executor = DelegateExecutor()
    executor(DelegateAction(command="spawn", ids=["a1", "a2"]), parent_conversation)

    # Register each sub-agent LLM in its own conversation stats
    # (simulates what _ensure_agent_ready does).
    for agent_id in ("a1", "a2"):
        conv = executor._sub_agents[agent_id]
        sub_llm = conv.agent.llm
        conv.conversation_stats.usage_to_metrics[sub_llm.usage_id] = sub_llm.metrics

    # Simulated LLM usage: (agent id, cost, prompt tokens, completion tokens,
    # response id).
    simulated_usage = [
        ("a1", 1.00, 100, 50, "a1_r1"),
        ("a2", 2.00, 200, 100, "a2_r1"),
    ]
    for agent_id, cost, prompt, completion, response_id in simulated_usage:
        sub_metrics = executor._sub_agents[agent_id].agent.llm.metrics
        sub_metrics.add_cost(cost)
        sub_metrics.add_token_usage(
            prompt_tokens=prompt,
            completion_tokens=completion,
            cache_read_tokens=0,
            cache_write_tokens=0,
            context_window=128000,
            response_id=response_id,
        )

    # Stub out send_message/run on both sub-conversations so delegation
    # performs no real LLM calls.
    conv_a1 = executor._sub_agents["a1"]
    conv_a2 = executor._sub_agents["a2"]
    with (
        patch.object(conv_a1, "send_message"),
        patch.object(conv_a1, "run"),
        patch.object(conv_a2, "send_message"),
        patch.object(conv_a2, "run"),
    ):
        executor(
            DelegateAction(
                command="delegate",
                tasks={"a1": "task 1", "a2": "task 2"},
            ),
            parent_conversation,
        )

    # Each sub-agent's metrics must now be listed under a "delegate:" key.
    for key in ("delegate:a1", "delegate:a2"):
        assert key in parent_stats.usage_to_metrics
    expected_costs = (("delegate:a1", 1.00), ("delegate:a2", 2.00))
    for key, expected_cost in expected_costs:
        assert parent_stats.usage_to_metrics[key].accumulated_cost == expected_cost

    # The combined view sums the parent and both sub-agents.
    combined = parent_stats.get_combined_metrics()
    assert combined.accumulated_cost == 3.00
    token_totals = combined.accumulated_token_usage
    assert token_totals is not None
    assert token_totals.prompt_tokens == 300
    assert token_totals.completion_tokens == 150
| 328 | + |
| 329 | + |
def test_repeated_delegation_does_not_double_count():
    """Two delegations to one sub-agent report its cumulative cost only once.

    The sub-agent's Metrics object accumulates across delegations, so the
    parent's ``delegate:a1`` entry must track that running total rather than
    re-adding earlier usage on every delegation.
    """
    register_builtins_agents()
    parent_llm = LLM(
        model="openai/gpt-4o",
        api_key=SecretStr("test-key"),
        base_url="https://api.openai.com/v1",
    )
    parent_stats = ConversationStats()
    parent_stats.usage_to_metrics["agent"] = parent_llm.metrics

    # Minimal stand-in for the parent conversation the executor reads from.
    parent_conversation = MagicMock()
    parent_conversation.id = uuid.uuid4()
    parent_conversation.agent.llm = parent_llm
    parent_conversation.state.workspace.working_dir = "/tmp"
    parent_conversation._visualizer = None
    parent_conversation.conversation_stats = parent_stats

    executor = DelegateExecutor()
    executor(DelegateAction(command="spawn", ids=["a1"]), parent_conversation)

    # Register the sub-agent LLM in its own conversation stats.
    sub_conv = executor._sub_agents["a1"]
    sub_llm = sub_conv.agent.llm
    sub_conv.conversation_stats.usage_to_metrics[sub_llm.usage_id] = sub_llm.metrics

    a1_llm = executor._sub_agents["a1"].agent.llm

    # Each round: (cost added before delegating, task text, expected total).
    # The second expectation is $3.00 (cumulative); $4.00 would mean the
    # first round's $1.00 had been counted twice.
    rounds = [
        (1.00, "first task", 1.00),
        (2.00, "second task", 3.00),
    ]
    for added_cost, task_text, expected_total in rounds:
        a1_llm.metrics.add_cost(added_cost)
        target = executor._sub_agents["a1"]
        # Stub out send_message/run so no real LLM calls happen.
        with (
            patch.object(target, "send_message"),
            patch.object(target, "run"),
        ):
            executor(
                DelegateAction(command="delegate", tasks={"a1": task_text}),
                parent_conversation,
            )
        reported = parent_stats.usage_to_metrics["delegate:a1"].accumulated_cost
        assert reported == expected_total
| 384 | + |
| 385 | + |
216 | 386 | def test_issue_2216(): |
217 | 387 | """Reproduce issue #2216: DelegateAction rejects tasks sent as a JSON string. |
218 | 388 |
|
|
0 commit comments