|
16 | 16 | from .. import _utils, usage
|
17 | 17 | from .._utils import PeekableAsyncStream
|
18 | 18 | from ..messages import (
|
19 |
| - AudioUrl, |
20 | 19 | BinaryContent,
|
21 |
| - ImageUrl, |
22 | 20 | ModelMessage,
|
23 | 21 | ModelRequest,
|
24 | 22 | ModelResponse,
|
@@ -345,18 +343,19 @@ def _estimate_usage(messages: Iterable[ModelMessage]) -> usage.Usage:
|
345 | 343 | def _estimate_string_tokens(content: str | Sequence[UserContent]) -> int:
|
346 | 344 | if not content:
|
347 | 345 | return 0
|
| 346 | + |
348 | 347 | if isinstance(content, str):
|
349 |
| - return len(re.split(r'[\s",.:]+', content.strip())) |
350 |
| - else: |
351 |
| - tokens = 0 |
352 |
| - for part in content: |
353 |
| - if isinstance(part, str): |
354 |
| - tokens += len(re.split(r'[\s",.:]+', part.strip())) |
355 |
| - # TODO(Marcelo): We need to study how we can estimate the tokens for these types of content. |
356 |
| - if isinstance(part, (AudioUrl, ImageUrl)): |
357 |
| - tokens += 0 |
358 |
| - elif isinstance(part, BinaryContent): |
359 |
| - tokens += len(part.data) |
360 |
| - else: |
361 |
| - tokens += 0 |
362 |
| - return tokens |
| 348 | + return len(_TOKEN_SPLIT_RE.split(content.strip())) |
| 349 | + |
| 350 | + tokens = 0 |
| 351 | + for part in content: |
| 352 | + if isinstance(part, str): |
| 353 | + tokens += len(_TOKEN_SPLIT_RE.split(part.strip())) |
| 354 | + elif isinstance(part, BinaryContent): |
| 355 | + tokens += len(part.data) |
| 356 | + # TODO(Marcelo): We need to study how we can estimate the tokens for AudioUrl or ImageUrl. |
| 357 | + |
| 358 | + return tokens |
| 359 | + |
| 360 | + |
| 361 | +_TOKEN_SPLIT_RE = re.compile(r'[\s",.:]+') |
0 commit comments