
Commit 0b3d020

Speed up function _estimate_string_tokens (#2156)
Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com>
1 parent f3ad3e6 commit 0b3d020

1 file changed: +15 −16 lines changed


pydantic_ai_slim/pydantic_ai/models/function.py

Lines changed: 15 additions & 16 deletions
@@ -16,9 +16,7 @@
 from .. import _utils, usage
 from .._utils import PeekableAsyncStream
 from ..messages import (
-    AudioUrl,
     BinaryContent,
-    ImageUrl,
     ModelMessage,
     ModelRequest,
     ModelResponse,
@@ -345,18 +343,19 @@ def _estimate_usage(messages: Iterable[ModelMessage]) -> usage.Usage:
 def _estimate_string_tokens(content: str | Sequence[UserContent]) -> int:
     if not content:
         return 0
+
     if isinstance(content, str):
-        return len(re.split(r'[\s",.:]+', content.strip()))
-    else:
-        tokens = 0
-        for part in content:
-            if isinstance(part, str):
-                tokens += len(re.split(r'[\s",.:]+', part.strip()))
-            # TODO(Marcelo): We need to study how we can estimate the tokens for these types of content.
-            if isinstance(part, (AudioUrl, ImageUrl)):
-                tokens += 0
-            elif isinstance(part, BinaryContent):
-                tokens += len(part.data)
-            else:
-                tokens += 0
-        return tokens
+        return len(_TOKEN_SPLIT_RE.split(content.strip()))
+
+    tokens = 0
+    for part in content:
+        if isinstance(part, str):
+            tokens += len(_TOKEN_SPLIT_RE.split(part.strip()))
+        elif isinstance(part, BinaryContent):
+            tokens += len(part.data)
+        # TODO(Marcelo): We need to study how we can estimate the tokens for AudioUrl or ImageUrl.
+
+    return tokens
+
+
+_TOKEN_SPLIT_RE = re.compile(r'[\s",.:]+')
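
The speedup comes from two changes visible in the diff: the split pattern is compiled once at module level instead of being passed as a pattern string to re.split on every call, and the redundant isinstance branches that only ever added 0 are dropped. A minimal micro-benchmark sketch of the regex change (the helper names and sample text below are illustrative, not part of the repository):

    import re
    import timeit

    # Precompiled once at import time, mirroring the commit's module-level pattern.
    _TOKEN_SPLIT_RE = re.compile(r'[\s",.:]+')

    # Illustrative sample text, not taken from the PR.
    SAMPLE = 'hello world, this is a short "sample" message: with punctuation. ' * 20


    def split_inline(text: str) -> int:
        # Old style: pass the pattern string to re.split on every call.
        return len(re.split(r'[\s",.:]+', text.strip()))


    def split_precompiled(text: str) -> int:
        # New style: reuse the module-level compiled pattern.
        return len(_TOKEN_SPLIT_RE.split(text.strip()))


    if __name__ == '__main__':
        # Both variants should produce the same token count.
        assert split_inline(SAMPLE) == split_precompiled(SAMPLE)
        print('inline     :', timeit.timeit(lambda: split_inline(SAMPLE), number=50_000))
        print('precompiled:', timeit.timeit(lambda: split_precompiled(SAMPLE), number=50_000))

Note that the re module already caches compiled patterns internally, so the gain is mainly from skipping the per-call cache lookup and pattern-string handling; on a hot path like token estimation over many message parts, that overhead adds up.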
