Commit 68918cb
refactor: only successful tool invocations are counted towards tool_calls usage metric
- Adjusted the tool call counting mechanism to ensure that only successful tool invocations are counted towards the `tool_calls` metric.
- Updated documentation in `tools.md` to clarify that output tools do not increment the `tool_calls` count.
- Modified multiple test cases to reflect the correct counting of tool calls, including tests for failed tool calls and their impact on usage metrics.
Parent: 146ad10 · Commit: 68918cb
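
In practice, the effect of the change can be sketched like this (illustrative only, not part of the commit; it assumes `Agent`, `TestModel`, `tool_plain`, and `output_type` behave as they do in the project's own tests):

```python
from pydantic_ai import Agent
from pydantic_ai.models.test import TestModel

# Structured output means the final answer is delivered through an output tool.
agent = Agent(TestModel(), output_type=int)

@agent.tool_plain
def roll_die() -> int:
    """A deterministic stand-in for a real tool."""
    return 4

result = agent.run_sync('Roll the die')
# Only the successful `roll_die` execution is recorded;
# the output tool call used for the structured result is not.
print(result.usage().tool_calls)  # expected: 1
```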

File tree: 6 files changed (+31 / -6 lines)

docs/tools.md

Lines changed: 1 addition & 1 deletion

@@ -730,7 +730,7 @@ When a model returns multiple tool calls in one response, Pydantic AI schedules
 Async functions are run on the event loop, while sync functions are offloaded to threads. To get the best performance, _always_ use an async function _unless_ you're doing blocking I/O (and there's no way to use a non-blocking library instead) or CPU-bound work (like `numpy` or `scikit-learn` operations), so that simple functions are not offloaded to threads unnecessarily.

 !!! note "Limiting exact tool executions"
-    You can cap the exact number of tool executions within a run using `UsageLimits(tool_calls_limit=...)`. The counter increments immediately before each actual tool invocation (after successful argument validation), and concurrent calls are counted safely.
+    You can cap the exact number of tool executions within a run using `UsageLimits(tool_calls_limit=...)`. The counter increments after each successful tool invocation. Note that output tools (used for structured output) are not counted in the `tool_calls` metric.

 ## Third-Party Tools

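To make the documented cap concrete, here is a minimal sketch (assuming `UsageLimits` is importable from `pydantic_ai.usage` and that `Agent.run_sync` accepts a `usage_limits` argument, as in the project's tests; the tool and prompt are made up):

```python
from pydantic_ai import Agent, UsageLimitExceeded
from pydantic_ai.models.test import TestModel
from pydantic_ai.usage import UsageLimits

agent = Agent(TestModel())

@agent.tool_plain
def lookup_weather(city: str) -> str:
    return f'Weather in {city}: sunny'

try:
    result = agent.run_sync(
        'What is the weather in Paris?',
        usage_limits=UsageLimits(tool_calls_limit=1),
    )
    # At most one successful tool execution is recorded against the limit.
    print(result.usage().tool_calls)
except UsageLimitExceeded as exc:
    # Raised by the pre-call check once the recorded count has reached the limit.
    print(exc)
```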

pydantic_ai_slim/pydantic_ai/_tool_manager.py

Lines changed: 5 additions & 1 deletion

@@ -122,9 +122,13 @@ async def _call_tool(self, call: ToolCallPart, allow_partial: bool, wrap_validat

             if tool.tool_def.kind != 'output' and self.ctx.usage_limits is not None:
                 self.ctx.usage_limits.check_before_tool_call(self.ctx.usage)
+
+            result = await self.toolset.call_tool(name, args_dict, ctx, tool)
+
+            if tool.tool_def.kind != 'output':
                 self.ctx.usage.tool_calls += 1

-            return await self.toolset.call_tool(name, args_dict, ctx, tool)
+            return result
         except (ValidationError, ModelRetry) as e:
             max_retries = tool.max_retries if tool is not None else 1
             current_retry = self.ctx.retries.get(name, 0)

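Worth noting about the ordering above: `check_before_tool_call` still runs before the tool is invoked, so `tool_calls_limit` is enforced against the successes recorded so far, while the counter itself is only incremented once `call_tool` has returned. A tool that raises `ValidationError` or `ModelRetry` therefore jumps straight to the `except` branch, and the failed attempt never reaches `self.ctx.usage.tool_calls += 1`.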

tests/models/test_cohere.py

Lines changed: 1 addition & 1 deletion

@@ -331,7 +331,7 @@ async def get_location(loc_name: str) -> str:
             input_tokens=5,
             output_tokens=3,
             details={'input_tokens': 4, 'output_tokens': 2},
-            tool_calls=2,
+            tool_calls=1,
         )
     )

tests/models/test_gemini.py

Lines changed: 1 addition & 1 deletion

@@ -783,7 +783,7 @@ async def get_location(loc_name: str) -> str:
             ),
         ]
     )
-    assert result.usage() == snapshot(RunUsage(requests=3, input_tokens=3, output_tokens=6, tool_calls=3))
+    assert result.usage() == snapshot(RunUsage(requests=3, input_tokens=3, output_tokens=6, tool_calls=2))


 async def test_unexpected_response(client_with_handler: ClientWithHandler, env: TestEnv, allow_model_requests: None):

tests/models/test_openai.py

Lines changed: 1 addition & 1 deletion

@@ -423,7 +423,7 @@ async def get_location(loc_name: str) -> str:
         ]
     )
     assert result.usage() == snapshot(
-        RunUsage(requests=3, cache_read_tokens=3, input_tokens=5, output_tokens=3, tool_calls=2)
+        RunUsage(requests=3, cache_read_tokens=3, input_tokens=5, output_tokens=3, tool_calls=1)
     )

tests/test_usage_limits.py

Lines changed: 22 additions & 1 deletion

@@ -11,6 +11,7 @@
 from pydantic import BaseModel

 from pydantic_ai import Agent, RunContext, UsageLimitExceeded
+from pydantic_ai.exceptions import ModelRetry
 from pydantic_ai.messages import ModelRequest, ModelResponse, ToolCallPart, ToolReturnPart, UserPromptPart
 from pydantic_ai.models.test import TestModel
 from pydantic_ai.output import ToolOutput

@@ -163,7 +164,7 @@ async def delegate_to_other_agent1(ctx: RunContext[None], sentence: str) -> int:
     async def delegate_to_other_agent2(ctx: RunContext[None], sentence: str) -> int:
         delegate_result = await delegate_agent.run(sentence, usage=ctx.usage)
         delegate_usage = delegate_result.usage()
-        assert delegate_usage == snapshot(RunUsage(requests=2, input_tokens=102, output_tokens=9, tool_calls=1))
+        assert delegate_usage == snapshot(RunUsage(requests=2, input_tokens=102, output_tokens=9))
         return delegate_result.output

     result2 = await controller_agent2.run('foobar')

@@ -287,3 +288,23 @@ async def another_regular_tool(x: str) -> str:
     result_output = await test_agent_with_output.run('test')

     assert result_output.usage() == snapshot(RunUsage(requests=2, input_tokens=103, output_tokens=15, tool_calls=1))
+
+
+async def test_failed_tool_calls_not_counted() -> None:
+    """Test that failed tool calls (raising ModelRetry) are not counted."""
+    test_agent = Agent(TestModel())
+
+    call_count = 0
+
+    @test_agent.tool_plain
+    async def flaky_tool(x: str) -> str:
+        nonlocal call_count
+        call_count += 1
+        if call_count == 1:
+            raise ModelRetry('Temporary failure, please retry')
+        return f'{x}-success'
+
+    result = await test_agent.run('test')
+    # The tool was called twice (1 failure + 1 success), but only the successful call should be counted
+    assert call_count == 2
+    assert result.usage() == snapshot(RunUsage(requests=3, input_tokens=176, output_tokens=29, tool_calls=1))

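The expected `requests=3` in the new test's snapshot follows from the retry flow: the first model request produces the tool call that fails with `ModelRetry`, the second produces the call that succeeds, and a third request returns the final response; the tool runs twice, but only the successful run is counted.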
