Commit 881cd7a

Add tool_calls_limit to UsageLimits and tool_calls to RunUsage (#2633)

1 parent 4918ba9 commit 881cd7a

15 files changed: +170 −21 lines changed

docs/agents.md
Lines changed: 25 additions & 2 deletions

@@ -539,7 +539,7 @@ _(This example is complete, it can be run "as is")_
 #### Usage Limits
 
 Pydantic AI offers a [`UsageLimits`][pydantic_ai.usage.UsageLimits] structure to help you limit your
-usage (tokens and/or requests) on model runs.
+usage (tokens, requests, and tool calls) on model runs.
 
 You can apply these settings by passing the `usage_limits` argument to the `run{_sync,_stream}` functions.
 
@@ -610,8 +610,31 @@ except UsageLimitExceeded as e:
 1. This tool has the ability to retry 5 times before erroring, simulating a tool that might get stuck in a loop.
 2. This run will error after 3 requests, preventing the infinite tool calling.
 
+##### Capping tool calls
+
+If you need a limit on the number of successful tool invocations within a single run, use `tool_calls_limit`:
+
+```py
+from pydantic_ai import Agent
+from pydantic_ai.exceptions import UsageLimitExceeded
+from pydantic_ai.usage import UsageLimits
+
+agent = Agent('anthropic:claude-3-5-sonnet-latest')
+
+@agent.tool_plain
+def do_work() -> str:
+    return 'ok'
+
+try:
+    # Allow at most one executed tool call in this run
+    agent.run_sync('Please call the tool twice', usage_limits=UsageLimits(tool_calls_limit=1))
+except UsageLimitExceeded as e:
+    print(e)
+    #> The next tool call would exceed the tool_calls_limit of 1 (tool_calls=1)
+```
+
 !!! note
-    Usage limits are especially relevant if you've registered many tools. The `request_limit` can be used to prevent the model from calling them in a loop too many times.
+    Usage limits are especially relevant if you've registered many tools. Use `request_limit` to bound the number of model turns, and `tool_calls_limit` to cap the number of successful tool executions within a run.
     These limits are enforced at the final stage before the LLM is called. If your limits are stricter than your retry settings, the usage limit will be reached before all retries are attempted.
 
 #### Model (Run) Settings
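
One note on the docs example above: `tool_calls_limit` composes with the existing limits rather than replacing them. A minimal sketch of bounding both model requests and tool executions in one run; the model name, prompt, and `lookup` tool here are illustrative, not from the commit:

```py
from pydantic_ai import Agent
from pydantic_ai.exceptions import UsageLimitExceeded
from pydantic_ai.usage import UsageLimits

agent = Agent('anthropic:claude-3-5-sonnet-latest')

@agent.tool_plain
def lookup(term: str) -> str:  # hypothetical tool for illustration
    return f'result for {term}'

try:
    # Bound both the number of model requests and the number of
    # successfully executed tool calls in a single run.
    agent.run_sync(
        'Research three terms',
        usage_limits=UsageLimits(request_limit=5, tool_calls_limit=2),
    )
except UsageLimitExceeded as e:
    print(e)  # whichever limit would be exceeded first raises
```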

docs/multi-agent-applications.md
Lines changed: 3 additions & 3 deletions

@@ -19,7 +19,7 @@ Since agents are stateless and designed to be global, you do not need to include
 You'll generally want to pass [`ctx.usage`][pydantic_ai.RunContext.usage] to the [`usage`][pydantic_ai.agent.AbstractAgent.run] keyword argument of the delegate agent run so usage within that run counts towards the total usage of the parent agent run.
 
 !!! note "Multiple models"
-    Agent delegation doesn't need to use the same model for each agent. If you choose to use different models within a run, calculating the monetary cost from the final [`result.usage()`][pydantic_ai.agent.AgentRunResult.usage] of the run will not be possible, but you can still use [`UsageLimits`][pydantic_ai.usage.UsageLimits] to avoid unexpected costs.
+    Agent delegation doesn't need to use the same model for each agent. If you choose to use different models within a run, calculating the monetary cost from the final [`result.usage()`][pydantic_ai.agent.AgentRunResult.usage] of the run will not be possible, but you can still use [`UsageLimits`][pydantic_ai.usage.UsageLimits] (including `request_limit`, `total_tokens_limit`, and `tool_calls_limit`) to avoid unexpected costs or runaway tool loops.
 
 ```python {title="agent_delegation_simple.py"}
 from pydantic_ai import Agent, RunContext, UsageLimits
@@ -52,7 +52,7 @@ result = joke_selection_agent.run_sync(
 print(result.output)
 #> Did you hear about the toothpaste scandal? They called it Colgate.
 print(result.usage())
-#> RunUsage(input_tokens=204, output_tokens=24, requests=3)
+#> RunUsage(input_tokens=204, output_tokens=24, requests=3, tool_calls=1)
 ```
 
 1. The "parent" or controlling agent.
@@ -143,7 +143,7 @@ async def main():
     print(result.output)
     #> Did you hear about the toothpaste scandal? They called it Colgate.
     print(result.usage())  # (6)!
-    #> RunUsage(input_tokens=309, output_tokens=32, requests=4)
+    #> RunUsage(input_tokens=309, output_tokens=32, requests=4, tool_calls=2)
 ```
 
 1. Define a dataclass to hold the client and API key dependencies.
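
The snapshot changes above fall out of the `RunUsage` aggregation: because delegate runs are passed `usage=ctx.usage`, their successful tool executions increment the same counter as the parent's. A small sketch of that aggregation using the `incr` method this commit extends; the specific numbers are illustrative:

```py
from pydantic_ai.usage import RunUsage

parent = RunUsage(requests=2, input_tokens=100, output_tokens=20, tool_calls=1)
delegate = RunUsage(requests=1, input_tokens=104, output_tokens=4, tool_calls=1)

# Mirrors how a delegate run's usage folds into the parent's: requests and
# tool_calls are summed alongside the token counts.
parent.incr(delegate)
print(parent)
#> RunUsage(input_tokens=204, output_tokens=24, requests=3, tool_calls=2)
```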

docs/tools-advanced.md
Lines changed: 3 additions & 0 deletions

@@ -377,6 +377,9 @@ When a model returns multiple tool calls in one response, Pydantic AI schedules
 
 Async functions are run on the event loop, while sync functions are offloaded to threads. To get the best performance, _always_ use an async function _unless_ you're doing blocking I/O (and there's no way to use a non-blocking library instead) or CPU-bound work (like `numpy` or `scikit-learn` operations), so that simple functions are not offloaded to threads unnecessarily.
 
+!!! note "Limiting tool executions"
+    You can cap tool executions within a run using [`UsageLimits(tool_calls_limit=...)`](agents.md#usage-limits). The counter increments only after a successful tool invocation. Output tools (used for structured output) are not counted in the `tool_calls` metric.
+
 ## See Also
 
 - [Function Tools](tools.md) - Basic tool concepts and registration
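
To observe the counter without a real provider, here is a hedged sketch using `TestModel`; it assumes `TestModel`'s default behavior of calling each registered tool once before producing a response:

```py
from pydantic_ai import Agent
from pydantic_ai.models.test import TestModel

agent = Agent(TestModel())

@agent.tool_plain
def ping() -> str:
    return 'pong'

result = agent.run_sync('go')
# Only the successful function-tool execution is counted; an output tool
# used for structured output would not increment this metric.
print(result.usage().tool_calls)
#> 1
```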

pydantic_ai_slim/pydantic_ai/_agent_graph.py
Lines changed: 6 additions & 3 deletions

@@ -756,6 +756,7 @@ async def process_function_tools(  # noqa: C901
         calls_to_run,
         deferred_tool_results,
         ctx.deps.tracer,
+        ctx.deps.usage_limits,
         output_parts,
         deferred_calls,
     ):
@@ -802,6 +803,7 @@ async def _call_tools(
    tool_calls: list[_messages.ToolCallPart],
    deferred_tool_results: dict[str, DeferredToolResult],
    tracer: Tracer,
+    usage_limits: _usage.UsageLimits | None,
    output_parts: list[_messages.ModelRequestPart],
    output_deferred_calls: dict[Literal['external', 'unapproved'], list[_messages.ToolCallPart]],
 ) -> AsyncIterator[_messages.HandleResponseEvent]:
@@ -822,7 +824,7 @@ async def _call_tools(
     ):
         tasks = [
             asyncio.create_task(
-                _call_tool(tool_manager, call, deferred_tool_results.get(call.tool_call_id)),
+                _call_tool(tool_manager, call, deferred_tool_results.get(call.tool_call_id), usage_limits),
                 name=call.tool_name,
             )
             for call in tool_calls
@@ -862,14 +864,15 @@ async def _call_tool(
    tool_manager: ToolManager[DepsT],
    tool_call: _messages.ToolCallPart,
    tool_call_result: DeferredToolResult | None,
+    usage_limits: _usage.UsageLimits | None,
 ) -> tuple[_messages.ToolReturnPart | _messages.RetryPromptPart, _messages.UserPromptPart | None]:
     try:
         if tool_call_result is None:
-            tool_result = await tool_manager.handle_call(tool_call)
+            tool_result = await tool_manager.handle_call(tool_call, usage_limits=usage_limits)
         elif isinstance(tool_call_result, ToolApproved):
             if tool_call_result.override_args is not None:
                 tool_call = dataclasses.replace(tool_call, args=tool_call_result.override_args)
-            tool_result = await tool_manager.handle_call(tool_call)
+            tool_result = await tool_manager.handle_call(tool_call, usage_limits=usage_limits)
         elif isinstance(tool_call_result, ToolDenied):
             return _messages.ToolReturnPart(
                 tool_name=tool_call.tool_name,
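
For reviewers skimming the plumbing: `usage_limits` is threaded as an explicit parameter from `process_function_tools` down to the leaf `_call_tool`, so the normal and deferred-approval paths enforce the same limit. An illustrative sketch of the pattern, with simplified names that are not the library's internals:

```py
import asyncio
from dataclasses import dataclass

@dataclass
class Limits:
    tool_calls_limit: int | None = None

async def call_tool(name: str, limits: Limits | None) -> str:
    # Only the leaf inspects the limits; intermediate layers pass them through.
    if limits is not None and limits.tool_calls_limit == 0:
        raise RuntimeError('tool call limit exhausted')
    return f'{name}: ok'

async def call_tools(names: list[str], limits: Limits | None) -> list[str]:
    # Mirrors the fan-out in _call_tools: one named task per tool call,
    # each receiving the same limits object.
    tasks = [asyncio.create_task(call_tool(n, limits), name=n) for n in names]
    return list(await asyncio.gather(*tasks))

print(asyncio.run(call_tools(['a', 'b'], Limits(tool_calls_limit=3))))
#> ['a: ok', 'b: ok']
```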

pydantic_ai_slim/pydantic_ai/_tool_manager.py
Lines changed: 29 additions & 6 deletions

@@ -14,6 +14,7 @@
 from .messages import ToolCallPart
 from .tools import ToolDefinition
 from .toolsets.abstract import AbstractToolset, ToolsetTool
+from .usage import UsageLimits
 
 
 @dataclass
@@ -66,31 +67,44 @@ def get_tool_def(self, name: str) -> ToolDefinition | None:
         return None
 
     async def handle_call(
-        self, call: ToolCallPart, allow_partial: bool = False, wrap_validation_errors: bool = True
+        self,
+        call: ToolCallPart,
+        allow_partial: bool = False,
+        wrap_validation_errors: bool = True,
+        usage_limits: UsageLimits | None = None,
     ) -> Any:
         """Handle a tool call by validating the arguments, calling the tool, and handling retries.
 
         Args:
             call: The tool call part to handle.
             allow_partial: Whether to allow partial validation of the tool arguments.
             wrap_validation_errors: Whether to wrap validation errors in a retry prompt part.
+            usage_limits: Optional usage limits to check before executing tools.
         """
         if self.tools is None or self.ctx is None:
             raise ValueError('ToolManager has not been prepared for a run step yet')  # pragma: no cover
 
         if (tool := self.tools.get(call.tool_name)) and tool.tool_def.kind == 'output':
-            # Output tool calls are not traced
-            return await self._call_tool(call, allow_partial, wrap_validation_errors)
+            # Output tool calls are not traced and not counted
+            return await self._call_tool(call, allow_partial, wrap_validation_errors, count_tool_usage=False)
         else:
             return await self._call_tool_traced(
                 call,
                 allow_partial,
                 wrap_validation_errors,
                 self.ctx.tracer,
                 self.ctx.trace_include_content,
+                usage_limits,
             )
 
-    async def _call_tool(self, call: ToolCallPart, allow_partial: bool, wrap_validation_errors: bool) -> Any:
+    async def _call_tool(
+        self,
+        call: ToolCallPart,
+        allow_partial: bool,
+        wrap_validation_errors: bool,
+        usage_limits: UsageLimits | None = None,
+        count_tool_usage: bool = True,
+    ) -> Any:
         if self.tools is None or self.ctx is None:
             raise ValueError('ToolManager has not been prepared for a run step yet')  # pragma: no cover
 
@@ -121,7 +135,15 @@ async def _call_tool(self, call: ToolCallPart, allow_partial: bool, wrap_validat
             else:
                 args_dict = validator.validate_python(call.args or {}, allow_partial=pyd_allow_partial)
 
-            return await self.toolset.call_tool(name, args_dict, ctx, tool)
+            if usage_limits is not None and count_tool_usage:
+                usage_limits.check_before_tool_call(self.ctx.usage)
+
+            result = await self.toolset.call_tool(name, args_dict, ctx, tool)
+
+            if count_tool_usage:
+                self.ctx.usage.tool_calls += 1
+
+            return result
         except (ValidationError, ModelRetry) as e:
             max_retries = tool.max_retries if tool is not None else 1
             current_retry = self.ctx.retries.get(name, 0)
@@ -160,6 +182,7 @@ async def _call_tool_traced(
         wrap_validation_errors: bool,
         tracer: Tracer,
         include_content: bool = False,
+        usage_limits: UsageLimits | None = None,
     ) -> Any:
         """See <https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span>."""
         span_attributes = {
@@ -189,7 +212,7 @@ async def _call_tool_traced(
         }
         with tracer.start_as_current_span('running tool', attributes=span_attributes) as span:
             try:
-                tool_result = await self._call_tool(call, allow_partial, wrap_validation_errors)
+                tool_result = await self._call_tool(call, allow_partial, wrap_validation_errors, usage_limits)
             except ToolRetryError as e:
                 part = e.tool_retry
                 if include_content and span.is_recording():
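
The behavioral core of this file is the check-then-increment bracket around the awaited call: the limit is checked before the tool runs, and the counter advances only after success, so validation errors and `ModelRetry` exceptions don't consume budget. A stripped-down sketch of that pattern, with illustrative names rather than the library's internals:

```py
import asyncio

class LimitExceeded(Exception):
    pass

async def call_with_budget(tool, usage: dict, limit: int | None):
    # 1. Refuse before running if the next call would exceed the limit.
    if limit is not None and usage['tool_calls'] >= limit:
        raise LimitExceeded(f'tool_calls_limit of {limit} reached')
    # 2. Run the tool; if it raises, the counter is left untouched,
    #    so failed calls and retries don't consume budget.
    result = await tool()
    # 3. Count only after success.
    usage['tool_calls'] += 1
    return result

async def main():
    usage = {'tool_calls': 0}

    async def ok():
        return 'ok'

    print(await call_with_budget(ok, usage, limit=1))  # ok
    try:
        await call_with_budget(ok, usage, limit=1)
    except LimitExceeded as e:
        print(e)  # tool_calls_limit of 1 reached

asyncio.run(main())
```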

pydantic_ai_slim/pydantic_ai/usage.py
Lines changed: 20 additions & 0 deletions

@@ -117,6 +117,9 @@ class RunUsage(UsageBase):
     requests: int = 0
     """Number of requests made to the LLM API."""
 
+    tool_calls: int = 0
+    """Number of successful tool calls executed during the run."""
+
     input_tokens: int = 0
     """Total number of text input/prompt tokens."""
 
@@ -146,6 +149,7 @@ def incr(self, incr_usage: RunUsage | RequestUsage) -> None:
         """
         if isinstance(incr_usage, RunUsage):
             self.requests += incr_usage.requests
+            self.tool_calls += incr_usage.tool_calls
         return _incr_usage_tokens(self, incr_usage)
 
     def __add__(self, other: RunUsage | RequestUsage) -> RunUsage:
@@ -194,6 +198,8 @@ class UsageLimits:
 
     request_limit: int | None = 50
     """The maximum number of requests allowed to the model."""
+    tool_calls_limit: int | None = None
+    """The maximum number of successful tool calls allowed to be executed."""
     input_tokens_limit: int | None = None
     """The maximum number of input/prompt tokens allowed."""
     output_tokens_limit: int | None = None
@@ -220,12 +226,14 @@ def __init__(
         self,
         *,
         request_limit: int | None = 50,
+        tool_calls_limit: int | None = None,
         input_tokens_limit: int | None = None,
         output_tokens_limit: int | None = None,
         total_tokens_limit: int | None = None,
         count_tokens_before_request: bool = False,
     ) -> None:
         self.request_limit = request_limit
+        self.tool_calls_limit = tool_calls_limit
         self.input_tokens_limit = input_tokens_limit
         self.output_tokens_limit = output_tokens_limit
         self.total_tokens_limit = total_tokens_limit
@@ -239,12 +247,14 @@ def __init__(
         self,
         *,
         request_limit: int | None = 50,
+        tool_calls_limit: int | None = None,
         request_tokens_limit: int | None = None,
         response_tokens_limit: int | None = None,
         total_tokens_limit: int | None = None,
         count_tokens_before_request: bool = False,
     ) -> None:
         self.request_limit = request_limit
+        self.tool_calls_limit = tool_calls_limit
         self.input_tokens_limit = request_tokens_limit
         self.output_tokens_limit = response_tokens_limit
         self.total_tokens_limit = total_tokens_limit
@@ -254,6 +264,7 @@ def __init__(
         self,
         *,
         request_limit: int | None = 50,
+        tool_calls_limit: int | None = None,
         input_tokens_limit: int | None = None,
         output_tokens_limit: int | None = None,
         total_tokens_limit: int | None = None,
@@ -263,6 +274,7 @@ def __init__(
         response_tokens_limit: int | None = None,
     ):
         self.request_limit = request_limit
+        self.tool_calls_limit = tool_calls_limit
         self.input_tokens_limit = input_tokens_limit or request_tokens_limit
         self.output_tokens_limit = output_tokens_limit or response_tokens_limit
         self.total_tokens_limit = total_tokens_limit
@@ -314,4 +326,12 @@ def check_tokens(self, usage: RunUsage) -> None:
         if self.total_tokens_limit is not None and total_tokens > self.total_tokens_limit:
             raise UsageLimitExceeded(f'Exceeded the total_tokens_limit of {self.total_tokens_limit} ({total_tokens=})')
 
+    def check_before_tool_call(self, usage: RunUsage) -> None:
+        """Raises a `UsageLimitExceeded` exception if the next tool call would exceed the tool call limit."""
+        tool_calls_limit = self.tool_calls_limit
+        if tool_calls_limit is not None and usage.tool_calls >= tool_calls_limit:
+            raise UsageLimitExceeded(
+                f'The next tool call would exceed the tool_calls_limit of {tool_calls_limit} (tool_calls={usage.tool_calls})'
+            )
+
     __repr__ = _utils.dataclasses_no_defaults_repr
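
The new guard can be exercised directly against the types changed in this file; the expected message below mirrors the f-string defined above:

```py
from pydantic_ai.exceptions import UsageLimitExceeded
from pydantic_ai.usage import RunUsage, UsageLimits

limits = UsageLimits(tool_calls_limit=2)

# Below the limit: no exception is raised.
limits.check_before_tool_call(RunUsage(tool_calls=1))

# At the limit: the next call would exceed it, so the guard raises.
try:
    limits.check_before_tool_call(RunUsage(tool_calls=2))
except UsageLimitExceeded as e:
    print(e)
    #> The next tool call would exceed the tool_calls_limit of 2 (tool_calls=2)
```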

tests/models/test_anthropic.py
Lines changed: 1 addition & 0 deletions

@@ -667,6 +667,7 @@ async def my_tool(first: str, second: str) -> int:
             requests=2,
             input_tokens=20,
             output_tokens=5,
+            tool_calls=1,
             details={'input_tokens': 20, 'output_tokens': 5},
         )
     )

tests/models/test_bedrock.py
Lines changed: 1 addition & 1 deletion

@@ -112,7 +112,7 @@ async def temperature(city: str, date: datetime.date) -> str:
 
     result = await agent.run('What was the temperature in London 1st January 2022?', output_type=Response)
     assert result.output == snapshot({'temperature': '30°C', 'date': datetime.date(2022, 1, 1), 'city': 'London'})
-    assert result.usage() == snapshot(RunUsage(requests=2, input_tokens=1236, output_tokens=298))
+    assert result.usage() == snapshot(RunUsage(requests=2, input_tokens=1236, output_tokens=298, tool_calls=1))
     assert result.all_messages() == snapshot(
         [
             ModelRequest(

tests/models/test_cohere.py
Lines changed: 1 addition & 0 deletions

@@ -330,6 +330,7 @@ async def get_location(loc_name: str) -> str:
             input_tokens=5,
             output_tokens=3,
             details={'input_tokens': 4, 'output_tokens': 2},
+            tool_calls=1,
         )
     )

tests/models/test_gemini.py
Lines changed: 2 additions & 2 deletions

@@ -783,7 +783,7 @@ async def get_location(loc_name: str) -> str:
             ),
         ]
     )
-    assert result.usage() == snapshot(RunUsage(requests=3, input_tokens=3, output_tokens=6))
+    assert result.usage() == snapshot(RunUsage(requests=3, input_tokens=3, output_tokens=6, tool_calls=2))
 
 
 async def test_unexpected_response(client_with_handler: ClientWithHandler, env: TestEnv, allow_model_requests: None):
@@ -932,7 +932,7 @@ async def bar(y: str) -> str:
     async with agent.run_stream('Hello') as result:
         response = await result.get_output()
         assert response == snapshot((1, 2))
-        assert result.usage() == snapshot(RunUsage(requests=2, input_tokens=2, output_tokens=4))
+        assert result.usage() == snapshot(RunUsage(requests=2, input_tokens=2, output_tokens=4, tool_calls=2))
     assert result.all_messages() == snapshot(
         [
             ModelRequest(parts=[UserPromptPart(content='Hello', timestamp=IsNow(tz=timezone.utc))]),
