Skip to content

Commit 835e43d

Browse files
perf: use blake3 instead of sha256
Signed-off-by: thiswillbeyourgithub <[email protected]>
1 parent 9276995 commit 835e43d

File tree

3 files changed

+8
-7
lines changed

3 files changed

+8
-7
lines changed

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ def run(self):
165165
"uuid6 >= 2025.0.1", # for time sortable timestamp
166166
"PersistDict >= 0.2.14", # by me, like a dict but an LMDB database, to fix langchain's caches
167167
"nltk>=3.9.2", # needed for punkt_tab download in post-install
168+
"blake3>=1.0.8", # faster than sha256
168169
"pandas >= 2.3.3",
169170
# some loaders are included by default:
170171
"playwright >= 1.45.0", # for online_media and urls

tests/test_parsing.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import os
22
import tempfile
3-
import hashlib
3+
from blake3 import blake3
44
from pathlib import Path
55

66
import pytest
@@ -134,9 +134,9 @@ def test_parse_docx():
134134
f.write(response.content)
135135

136136
# Verify BLAKE3 checksum
137-
expected_hash = "64b73b409688cc5b5675c07d9df4b83d353fa85026a9d686d6725e50f388930e1d57c56cc6cfebd5f2cecc06d7ef89ae7495bd5411ca0eac4b0df63a7d6c82dc"
137+
expected_hash = "e69f84a2b494c767255d5d6934e3d12648fccb7dbc06e1cd47cad1b1c6efc902"
138138
with open(tmp_path, "rb") as f:
139-
file_hash = hashlib.sha512(f.read()).hexdigest()
139+
file_hash = blake3(f.read()).hexdigest()
140140
assert file_hash == expected_hash, (
141141
f"File hash {file_hash} does not match expected {expected_hash}"
142142
)

wdoc/utils/misc.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Miscellaneous functions etc.
33
"""
44

5-
import hashlib
5+
from blake3 import blake3
66
import re
77
import inspect
88
import json
@@ -359,7 +359,7 @@ def wrapper(*args, **kwargs):
359359
def hasher(text: str) -> str:
360360
"""used to hash the text content of each doc to cache the splitting and
361361
embeddings"""
362-
return hashlib.sha256(text.encode()).hexdigest()[:20]
362+
return blake3(text.encode()).hexdigest()[:20]
363363

364364

365365
def file_hasher(doc: dict) -> str:
@@ -396,7 +396,7 @@ def file_hasher(doc: dict) -> str:
396396
@hashdoc_cache.cache
397397
def _file_hasher(abs_path: str, stats: List[Union[int, float]]) -> str:
398398
with open(abs_path, "rb") as f:
399-
return hashlib.sha256(f.read()).hexdigest()[:20]
399+
return blake3(f.read()).hexdigest()[:20]
400400

401401

402402
def html_to_text(html: str, remove_image: bool = False) -> str:
@@ -614,7 +614,7 @@ def __post_init__(self):
614614
with open(
615615
Path(self.model).resolve().absolute().__str__(), "rb"
616616
) as f:
617-
h = hashlib.sha256(f.read() + str(self.model)).hexdigest()[:15]
617+
h = blake3(f.read() + str(self.model)).hexdigest()[:15]
618618
self.sanitized = Path(self.model).name + "_" + h
619619
except Exception:
620620
pass

0 commit comments

Comments
 (0)