Skip to content

Commit a756618

Browse files
authored
Vendor Hasher class from datasets library in order to remove larger dependency (#8621)
* Vendor Hasher class from datasets library in order to remove larger dependency
* Run ruff - add change notice for Apache
1 parent e9d8c04 commit a756618

File tree

5 files changed

+50
-6
lines changed

5 files changed

+50
-6
lines changed

dspy/clients/utils_finetune.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def write_lines(file_path, data):
6666
def save_data(
6767
data: list[dict[str, Any]],
6868
) -> str:
69-
from datasets.fingerprint import Hasher
69+
from dspy.utils.hasher import Hasher
7070

7171
# Assign a unique name to the file based on the data hash
7272
hash = Hasher.hash(data)

dspy/teleprompt/bootstrap.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ def _bootstrap_one_example(self, example, round_idx=0):
241241
# If there are multiple traces for the same predictor in the sample example,
242242
# sample 50/50 from the first N-1 traces or the last trace.
243243
if len(demos) > 1:
244-
from datasets.fingerprint import Hasher
244+
from dspy.utils.hasher import Hasher
245245

246246
rng = random.Random(Hasher.hash(tuple(demos)))
247247
demos = [rng.choice(demos[:-1]) if rng.random() < 0.5 else demos[-1]]

dspy/utils/hasher.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from pickle import dumps
2+
from typing import Any
3+
4+
import xxhash
5+
6+
"""
7+
The following class was pulled from the `datasets` package from Hugging Face.
8+
The reason for vendoring this code is to avoid a hard dependency on `datasets`,
9+
which is a large package that is not needed for the majority of use cases.
10+
11+
License: Apache License 2.0
12+
Author: Hugging Face Inc.
13+
URL: https://github.com/huggingface/datasets/blob/fa73ab472eecf9136a3daf7a0fbff16a3dffa7a6/src/datasets/fingerprint.py#L170
14+
Changes: 2025-08-10 - Ran ruff to format the code to DSPy styles.
15+
"""
16+
class Hasher:
    """Hash picklable Python objects into xxh64 hexadecimal digests.

    Two usage styles are supported:

    * One-shot: ``Hasher.hash(obj)`` pickles ``obj`` and returns the xxh64
      hex digest of the pickled bytes.
    * Incremental: create an instance, feed it values with ``update`` and
      read the combined digest with ``hexdigest``.

    xxh64 is a fast, non-cryptographic hash; the digests are fingerprints,
    not security tokens.

    NOTE(review): pickle output is not a canonical form (for example, the
    iteration order of a set can differ between processes), so two equal
    objects are not guaranteed to produce the same digest in every case.
    """

    # Not referenced by any method of this class; kept so the attribute stays
    # available to code written against the original class layout.
    dispatch: dict = {}

    # The pickle protocol is pinned instead of relying on the interpreter's
    # default, because the default changes between Python releases (4 on
    # Python 3.8-3.13, 5 from 3.14). Pinning to 4 keeps digests identical to
    # what the unpinned call produced on 3.8-3.13.
    _PICKLE_PROTOCOL: int = 4

    def __init__(self):
        # Running xxh64 state used by ``update`` / ``hexdigest``.
        self.m = xxhash.xxh64()

    @classmethod
    def hash_bytes(cls, value: bytes | list[bytes]) -> str:
        """Return the xxh64 hex digest of one or more byte strings.

        Args:
            value: A single ``bytes`` object, or a list of ``bytes`` chunks
                that are fed to the hasher in order.

        Returns:
            The hexadecimal digest as a string.
        """
        # Normalize the single-object form so both cases share one loop.
        value = [value] if isinstance(value, bytes) else value
        m = xxhash.xxh64()
        for x in value:
            m.update(x)
        return m.hexdigest()

    @classmethod
    def hash(cls, value: Any) -> str:
        """Return the xxh64 hex digest of the pickled form of ``value``.

        Args:
            value: Any object that ``pickle.dumps`` can serialize.

        Returns:
            The hexadecimal digest as a string.

        Raises:
            Whatever ``pickle.dumps`` raises when ``value`` cannot be pickled.
        """
        return cls.hash_bytes(dumps(value, protocol=cls._PICKLE_PROTOCOL))

    def update(self, value: Any) -> None:
        """Fold ``value`` into this instance's running digest.

        A header built from ``type(value)`` is mixed in ahead of the value's
        own digest, so the type of each value contributes to the result.

        Args:
            value: Any object accepted by ``hash``.
        """
        header_for_update = f"=={type(value)}=="
        value_for_update = self.hash(value)
        self.m.update(header_for_update.encode("utf8"))
        self.m.update(value_for_update.encode("utf-8"))

    def hexdigest(self) -> str:
        """Return the hex digest of everything passed to ``update`` so far."""
        return self.m.hexdigest()

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,7 @@ dependencies = [
2424
"backoff>=2.2",
2525
"joblib~=1.3",
2626
"openai>=0.28.1",
27-
"datasets>=2.14.6", # needed for Bootstrap's Hasher
2827
"regex>=2023.10.3",
29-
"datasets>=2.14.6", # needed for Bootstrap's Hasher
3028
"ujson>=5.8.0",
3129
"tqdm>=4.66.1",
3230
"requests>=2.31.0",
@@ -43,6 +41,7 @@ dependencies = [
4341
"cloudpickle>=3.0.0",
4442
"rich>=13.7.1",
4543
"numpy>=1.26.0",
44+
"xxhash>=3.5.0",
4645
]
4746

4847
[project.optional-dependencies]

uv.lock

Lines changed: 4 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)