12
12
import hashlib
13
13
import json
14
14
import uuid
15
+ import warnings
15
16
from collections .abc import Sequence
16
- from functools import partial
17
- from typing import Callable , Optional , Union , cast
17
+ from typing import Callable , Literal , Optional , Union , cast
18
18
19
19
from langchain_core .embeddings import Embeddings
20
20
from langchain_core .stores import BaseStore , ByteStore
25
25
# Fixed namespace for every UUID derived by `_sha1_hash_to_uuid`; changing it
# would change all keys produced by the SHA-1 encoder.
NAMESPACE_UUID = uuid.UUID(int=1985)


def _sha1_hash_to_uuid(text: str) -> uuid.UUID:
    """Return a UUID derived from *text* using SHA-1 (deterministic).

    The UTF-8 encoding of *text* is hashed with SHA-1 and the resulting hex
    digest is passed to `uuid.uuid5` together with `NAMESPACE_UUID`, so the
    same text always maps to the same UUID.

    Deterministic and fast, **but not collision-resistant**: a malicious
    attacker could try to create two different texts that hash to the same
    UUID. This may not necessarily be an issue in the context of caching
    embeddings, but new applications should swap this out for a
    collision-resistant hash function such as BLAKE2 or SHA-256.

    Args:
        text: The string to hash.

    Returns:
        The version-5 UUID computed from the SHA-1 hex digest of *text*.
    """
    # The digest is only a cache key, not a security primitive. Declaring that
    # lets the call succeed on builds where SHA-1 is blocked for security use
    # (e.g. FIPS mode) without changing the resulting digest.
    sha1_hex = hashlib.sha1(
        text.encode("utf-8"), usedforsecurity=False
    ).hexdigest()
    # Embed the hex string in `uuid5` to obtain a valid UUID.
    return uuid.uuid5(NAMESPACE_UUID, sha1_hex)
41
+
42
+
43
def _make_default_key_encoder(namespace: str, algorithm: str) -> Callable[[str], str]:
    """Create a default key encoder function.

    The algorithm is validated and resolved once, when the encoder is created,
    so an unsupported name fails immediately instead of on the first key that
    gets encoded.

    Args:
        namespace: Prefix that segregates keys from different embedding models.
        algorithm: Name of the hash used to derive the key:

            * `sha1` - fast but not collision-resistant; emits a one-time
              warning and encodes the key as a UUID derived from its SHA-1
              digest
            * `blake2b` - cryptographically strong
            * `sha256` - cryptographically strong
            * `sha512` - cryptographically strong

    Returns:
        A function that maps a key to `namespace` followed by the hash of the
        key (a UUID string for `sha1`, a hex digest otherwise).

    Raises:
        ValueError: If `algorithm` is not one of the supported names.
    """
    if algorithm == "sha1":
        _warn_about_sha1_encoder()

        def _sha1_key_encoder(key: str) -> str:
            """Encode a key as the namespace plus a SHA-1 derived UUID."""
            return f"{namespace}{_sha1_hash_to_uuid(key)}"

        return _sha1_key_encoder

    # Explicit allow-list: `hashlib` exposes many more constructors (md5, ...)
    # that must not be reachable through this argument.
    if algorithm not in ("blake2b", "sha256", "sha512"):
        raise ValueError(f"Unsupported algorithm: {algorithm}")

    # Resolve the constructor once rather than re-testing the name per key.
    hash_factory = getattr(hashlib, algorithm)

    def _key_encoder(key: str) -> str:
        """Encode a key as the namespace plus the hex digest of the key."""
        return f"{namespace}{hash_factory(key.encode('utf-8')).hexdigest()}"

    return _key_encoder
42
73
43
74
44
75
def _value_serializer (value : Sequence [float ]) -> bytes :
@@ -51,6 +82,28 @@ def _value_deserializer(serialized_value: bytes) -> list[float]:
51
82
return cast (list [float ], json .loads (serialized_value .decode ()))
52
83
53
84
85
# The warning is global; track emission, so it appears only once.
_warned_about_sha1: bool = False


def _warn_about_sha1_encoder() -> None:
    """Emit a one-time warning about SHA-1 collision weaknesses.

    The first call issues a `UserWarning` and sets the module-level
    `_warned_about_sha1` flag; later calls see the flag and do nothing.
    """
    global _warned_about_sha1
    if _warned_about_sha1:
        return
    warnings.warn(
        "Using default key encoder: SHA‑1 is *not* collision‑resistant. "
        "While acceptable for most cache scenarios, a motivated attacker "
        "can craft two different payloads that map to the same cache key. "
        "If that risk matters in your environment, supply a stronger "
        "encoder (e.g. SHA‑256 or BLAKE2) via the `key_encoder` argument. "
        "If you change the key encoder, consider also creating a new cache, "
        "to avoid (the potential for) collisions with existing keys.",
        category=UserWarning,
        # Frames above `warnings.warn`: this helper (1), the encoder factory
        # that calls it (2), `from_bytes_store` (3), and the user's call (4).
        # Point at the user's code so the warning is actionable for them.
        stacklevel=4,
    )
    _warned_about_sha1 = True
105
+
106
+
54
107
class CacheBackedEmbeddings (Embeddings ):
55
108
"""Interface for caching results from embedding models.
56
109
@@ -234,6 +287,9 @@ def from_bytes_store(
234
287
namespace : str = "" ,
235
288
batch_size : Optional [int ] = None ,
236
289
query_embedding_cache : Union [bool , ByteStore ] = False ,
290
+ key_encoder : Union [
291
+ Callable [[str ], str ], Literal ["sha1" , "blake2b" , "sha256" , "sha512" ]
292
+ ] = "sha1" ,
237
293
) -> CacheBackedEmbeddings :
238
294
"""On-ramp that adds the necessary serialization and encoding to the store.
239
295
@@ -248,9 +304,39 @@ def from_bytes_store(
248
304
query_embedding_cache: The cache to use for storing query embeddings.
249
305
True to use the same cache as document embeddings.
250
306
False to not cache query embeddings.
307
+ key_encoder: Optional callable to encode keys. If not provided,
308
+ a default encoder using SHA‑1 will be used. SHA-1 is not
309
+ collision-resistant, and a motivated attacker could craft two
310
+ different texts that hash to the same cache key.
311
+
312
+ New applications should use one of the alternative encoders
313
+ or provide a custom and strong key encoder function to avoid this risk.
314
+
315
+ If you change a key encoder in an existing cache, consider
316
+ just creating a new cache, to avoid (the potential for)
317
+ collisions with existing keys or having duplicate keys
318
+ for the same text in the cache.
319
+
320
+ Returns:
321
+ An instance of CacheBackedEmbeddings that uses the provided cache.
251
322
"""
252
- namespace = namespace
253
- key_encoder = _create_key_encoder (namespace )
323
+ if isinstance (key_encoder , str ):
324
+ key_encoder = _make_default_key_encoder (namespace , key_encoder )
325
+ elif callable (key_encoder ):
326
+ # If a custom key encoder is provided, it should not be used with a
327
+ # namespace.
328
# A user can handle namespacing directly in their custom key encoder.
329
+ if namespace :
330
+ raise ValueError (
331
+ "Do not supply `namespace` when using a custom key_encoder; "
332
+ "add any prefixing inside the encoder itself."
333
+ )
334
+ else :
335
+ raise ValueError (
336
+ "key_encoder must be either 'blake2b', 'sha1', 'sha256', 'sha512' "
337
+ "or a callable that encodes keys."
338
+ )
339
+
254
340
document_embedding_store = EncoderBackedStore [str , list [float ]](
255
341
document_embedding_cache ,
256
342
key_encoder ,
0 commit comments