-
Notifications
You must be signed in to change notification settings - Fork 1.5k
add identifier field support to FileUrl and subclasses #2636
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
02ad24b
39a402b
ad500b0
3211a2b
1427dbf
b80788a
ea2f049
f61233a
f2686de
a4f4f07
0905d20
737b3bc
31f2035
0d0e8a4
5e9b713
232c085
92a2645
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| from __future__ import annotations as _annotations | ||
|
|
||
| import base64 | ||
| import hashlib | ||
| from abc import ABC, abstractmethod | ||
| from collections.abc import Sequence | ||
| from dataclasses import KW_ONLY, dataclass, field, replace | ||
|
|
@@ -88,6 +89,13 @@ def otel_message_parts(self, settings: InstrumentationSettings) -> list[_otel_me | |
| __repr__ = _utils.dataclasses_no_defaults_repr | ||
|
|
||
|
|
||
def _multi_modal_content_identifier(identifier: str | bytes) -> str:
    """Generate a short, stable identifier for multi-modal content.

    The identifier helps the LLM find a specific file in tool call responses.

    Args:
        identifier: The value to derive the identifier from. A `str` is encoded as UTF-8 before hashing.

    Returns:
        The first 6 hexadecimal characters of the SHA-1 digest of the input.
    """
    if isinstance(identifier, str):
        identifier = identifier.encode('utf-8')
    # The digest is only a content fingerprint, not a security measure; declaring that keeps
    # this usable on Python builds that restrict SHA-1 for security purposes.
    return hashlib.sha1(identifier, usedforsecurity=False).hexdigest()[:6]
|
|
||
|
|
||
| @dataclass(init=False, repr=False) | ||
| class FileUrl(ABC): | ||
| """Abstract base class for any URL-based file.""" | ||
|
|
@@ -115,17 +123,31 @@ class FileUrl(ABC): | |
| compare=False, default=None | ||
| ) | ||
|
|
||
| identifier: str | None = None | ||
| """The identifier of the file, such as a unique ID. If not explicitly set, one is generated from the URL. | ||
|
|
||
| This identifier can be provided to the model in a message to allow it to refer to this file in a tool call argument, | ||
| and the tool can look up the file in question by iterating over the message history and finding the matching `FileUrl`. | ||
|
|
||
| This identifier is only automatically passed to the model when the `FileUrl` is returned by a tool. | ||
| If you're passing the `FileUrl` as a user message, it's up to you to include a separate text part with the identifier, | ||
| e.g. "This is file <identifier>:" preceding the `FileUrl`. | ||
| """ | ||
|
|
||
def __init__(
    self,
    url: str,
    *,
    force_download: bool = False,
    vendor_metadata: dict[str, Any] | None = None,
    media_type: str | None = None,
    identifier: str | None = None,
) -> None:
    """Initialise the URL-based file.

    Args:
        url: The URL of the file; stored as `url`.
        force_download: Stored unchanged as `force_download`.
        vendor_metadata: Stored unchanged as `vendor_metadata`.
        media_type: Explicit media type; stored in the private `_media_type` attribute.
        identifier: Identifier for the file. When omitted (or empty), one is derived from `url`.
    """
    self.url = url
    self.force_download = force_download
    self.vendor_metadata = vendor_metadata
    self._media_type = media_type
    # Fall back to a hash-derived identifier so every instance always has one.
    self.identifier = identifier or _multi_modal_content_identifier(url)
|
|
||
| @pydantic.computed_field | ||
| @property | ||
|
|
@@ -162,11 +184,12 @@ class VideoUrl(FileUrl): | |
| def __init__( | ||
| self, | ||
| url: str, | ||
| *, | ||
| force_download: bool = False, | ||
| vendor_metadata: dict[str, Any] | None = None, | ||
| media_type: str | None = None, | ||
| kind: Literal['video-url'] = 'video-url', | ||
| *, | ||
| identifier: str | None = None, | ||
| # Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs. | ||
| _media_type: str | None = None, | ||
| ) -> None: | ||
|
|
@@ -175,6 +198,7 @@ def __init__( | |
| force_download=force_download, | ||
| vendor_metadata=vendor_metadata, | ||
| media_type=media_type or _media_type, | ||
| identifier=identifier, | ||
| ) | ||
| self.kind = kind | ||
|
|
||
|
|
@@ -235,11 +259,12 @@ class AudioUrl(FileUrl): | |
| def __init__( | ||
| self, | ||
| url: str, | ||
| *, | ||
| force_download: bool = False, | ||
| vendor_metadata: dict[str, Any] | None = None, | ||
| media_type: str | None = None, | ||
| kind: Literal['audio-url'] = 'audio-url', | ||
| *, | ||
| identifier: str | None = None, | ||
| # Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs. | ||
| _media_type: str | None = None, | ||
| ) -> None: | ||
|
|
@@ -248,6 +273,7 @@ def __init__( | |
| force_download=force_download, | ||
| vendor_metadata=vendor_metadata, | ||
| media_type=media_type or _media_type, | ||
| identifier=identifier, | ||
| ) | ||
| self.kind = kind | ||
|
|
||
|
|
@@ -295,11 +321,12 @@ class ImageUrl(FileUrl): | |
| def __init__( | ||
| self, | ||
| url: str, | ||
| *, | ||
| force_download: bool = False, | ||
| vendor_metadata: dict[str, Any] | None = None, | ||
| media_type: str | None = None, | ||
| kind: Literal['image-url'] = 'image-url', | ||
| *, | ||
| identifier: str | None = None, | ||
| # Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs. | ||
| _media_type: str | None = None, | ||
| ) -> None: | ||
|
|
@@ -308,6 +335,7 @@ def __init__( | |
| force_download=force_download, | ||
| vendor_metadata=vendor_metadata, | ||
| media_type=media_type or _media_type, | ||
| identifier=identifier, | ||
| ) | ||
| self.kind = kind | ||
|
|
||
|
|
@@ -350,11 +378,12 @@ class DocumentUrl(FileUrl): | |
| def __init__( | ||
| self, | ||
| url: str, | ||
| *, | ||
| force_download: bool = False, | ||
| vendor_metadata: dict[str, Any] | None = None, | ||
| media_type: str | None = None, | ||
| kind: Literal['document-url'] = 'document-url', | ||
| *, | ||
| identifier: str | None = None, | ||
| # Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs. | ||
| _media_type: str | None = None, | ||
| ) -> None: | ||
|
|
@@ -363,6 +392,7 @@ def __init__( | |
| force_download=force_download, | ||
| vendor_metadata=vendor_metadata, | ||
| media_type=media_type or _media_type, | ||
| identifier=identifier, | ||
| ) | ||
| self.kind = kind | ||
|
|
||
|
|
@@ -405,24 +435,26 @@ def format(self) -> DocumentFormat: | |
| raise ValueError(f'Unknown document media type: {media_type}') from e | ||
|
|
||
|
|
||
| @dataclass(repr=False) | ||
| @dataclass(init=False, repr=False) | ||
| class BinaryContent: | ||
| """Binary content, e.g. an audio or image file.""" | ||
|
|
||
| data: bytes | ||
| """The binary data.""" | ||
|
|
||
| media_type: AudioMediaType | ImageMediaType | DocumentMediaType | str | ||
| """The media type of the binary data.""" | ||
|
|
||
| _: KW_ONLY | ||
|
|
||
| identifier: str | None = None | ||
| """Identifier for the binary content, such as a URL or unique ID. | ||
| media_type: AudioMediaType | ImageMediaType | DocumentMediaType | str | ||
| """The media type of the binary data.""" | ||
|
|
||
| This identifier can be provided to the model in a message to allow it to refer to this file in a tool call argument, and the tool can look up the file in question by iterating over the message history and finding the matching `BinaryContent`. | ||
| identifier: str | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @kousun12 Ay that was unintentional, I was relying on the fact that we always set an
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Related issue has been filed: #3103 |
||
| """Identifier for the binary content, such as a unique ID. If not explicitly set, one is generated from the data. | ||
| This identifier can be provided to the model in a message to allow it to refer to this file in a tool call argument, | ||
| and the tool can look up the file in question by iterating over the message history and finding the matching `BinaryContent`. | ||
|
|
||
| This identifier is only automatically passed to the model when the `BinaryContent` is returned by a tool. If you're passing the `BinaryContent` as a user message, it's up to you to include a separate text part with the identifier, e.g. "This is file <identifier>:" preceding the `BinaryContent`. | ||
| This identifier is only automatically passed to the model when the `BinaryContent` is returned by a tool. | ||
| If you're passing the `BinaryContent` as a user message, it's up to you to include a separate text part with the identifier, | ||
| e.g. "This is file <identifier>:" preceding the `BinaryContent`. | ||
| """ | ||
|
|
||
| vendor_metadata: dict[str, Any] | None = None | ||
|
|
@@ -435,6 +467,21 @@ class BinaryContent: | |
| kind: Literal['binary'] = 'binary' | ||
| """Type identifier, this is available on all parts as a discriminator.""" | ||
|
|
||
def __init__(
    self,
    data: bytes,
    *,
    media_type: AudioMediaType | ImageMediaType | DocumentMediaType | str,
    identifier: str | None = None,
    vendor_metadata: dict[str, Any] | None = None,
    kind: Literal['binary'] = 'binary',
) -> None:
    """Initialise the binary content.

    Args:
        data: The binary data.
        media_type: The media type of the binary data.
        identifier: Identifier for the content. When omitted (or empty), one is derived from `data`.
        vendor_metadata: Stored unchanged as `vendor_metadata`.
        kind: Type identifier used as a discriminator.
    """
    if not identifier:
        # No usable identifier supplied: derive a stable one from the raw bytes.
        identifier = _multi_modal_content_identifier(data)

    self.data = data
    self.media_type = media_type
    self.identifier = identifier
    self.vendor_metadata = vendor_metadata
    self.kind = kind
|
|
||
| @property | ||
| def is_audio(self) -> bool: | ||
| """Return `True` if the media type is an audio type.""" | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.