Skip to content

Commit 5505a38

Browse files
committed
Respond to comments
1 parent 11c0459 commit 5505a38

File tree

3 files changed

+24
-16
lines changed

3 files changed

+24
-16
lines changed

typings/fsspec/__init__.pyi

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,5 +27,6 @@ class AbstractFileSystem(RealAbstractFileSystem):
2727
def open(
2828
self, path: str, mode: Literal["w"], *args: Any, **kwargs: Any
2929
) -> io.TextIOWrapper: ...
30+
def exists(self, path: str) -> bool: ...
3031

3132
def url_to_fs(url: str) -> tuple[AbstractFileSystem, str]: ...

util/opentelemetry-util-genai/src/opentelemetry/util/genai/_upload/completion_hook.py

Lines changed: 16 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -15,14 +15,14 @@
1515

1616
from __future__ import annotations
1717

18-
import hashlib
18+
import binascii
1919
import logging
2020
import posixpath
2121
import threading
2222
from concurrent.futures import Future, ThreadPoolExecutor
2323
from contextlib import ExitStack
2424
from dataclasses import asdict, dataclass
25-
from functools import partial
25+
from functools import lru_cache, partial
2626
from os import environ
2727
from time import time
2828
from typing import Any, Callable, Final, Literal
@@ -159,15 +159,15 @@ def _calculate_ref_path(
159159
# TODO: experimental with using the trace_id and span_id, or fetching
160160
# gen_ai.response.id from the active span.
161161
system_instruction_hash = None
162-
# Use an md5 hash of the system instructions as a filename, when system instructions are text.
163162
if all(isinstance(x, types.Text) for x in system_instruction):
164-
md5_hash = hashlib.md5()
165-
md5_hash.update(
166-
"\n".join(x.content for x in system_instruction).encode( # pyright: ignore[reportUnknownMemberType, reportAttributeAccessIssue, reportUnknownArgumentType]
167-
"utf-8"
163+
# Get a checksum of the text.
164+
system_instruction_hash = hex(
165+
binascii.crc32(
166+
"\n".join(x.content for x in system_instruction).encode( # pyright: ignore[reportUnknownMemberType, reportAttributeAccessIssue, reportUnknownArgumentType]
167+
"utf-8"
168+
)
168169
)
169170
)
170-
system_instruction_hash = md5_hash.hexdigest()
171171
uuid_str = str(uuid4())
172172
return CompletionRefs(
173173
inputs_ref=posixpath.join(
@@ -182,12 +182,17 @@ def _calculate_ref_path(
182182
),
183183
)
184184

185+
@lru_cache(maxsize=512)
186+
def _file_exists(self, path: str) -> bool:
187+
# https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.exists
188+
return self._fs.exists(path)
189+
185190
def _do_upload(
186191
self, path: str, json_encodeable: Callable[[], JsonEncodeable]
187192
) -> None:
188-
# FileSystem class has this method. Only check for system instructions as that's the only where the filename is a hash.
189-
# https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.exists
190-
if "_system_instruction" in path and self._fs.exists(path): # pyright: ignore[reportUnknownMemberType]
193+
# Only check for system instruction file existence as that's the only file where the filename is a hash
194+
# of the content.
195+
if "_system_instruction" in path and self._file_exists(path):
191196
return
192197
if self._format == "json":
193198
# output as a single line with the json messages array

util/opentelemetry-util-genai/tests/test_upload.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414

1515

1616
# pylint: disable=import-outside-toplevel,no-name-in-module
17-
import hashlib
17+
import binascii
1818
import importlib
1919
import logging
2020
import sys
@@ -171,11 +171,13 @@ def test_system_insruction_is_hashed_to_avoid_reupload(self):
171171
types.Text(content="You are a helpful assistant."),
172172
types.Text(content="You will do your best."),
173173
]
174-
md5_hash = hashlib.md5()
175-
md5_hash.update(
176-
"\n".join(x.content for x in system_instructions).encode("utf-8")
174+
expected_hash = hex(
175+
binascii.crc32(
176+
"\n".join(x.content for x in system_instructions).encode(
177+
"utf-8"
178+
)
179+
)
177180
)
178-
expected_hash = md5_hash.hexdigest()
179181
record = LogRecord()
180182
self.hook.on_completion(
181183
inputs=[],

0 commit comments

Comments
 (0)