Skip to content

Commit 7048dbd

Browse files
fix: Type chunk size and overlap during index key generation (#45)
Fixed index key generation by casting chunk size and chunk overlap to strings before hashing
1 parent 14862e9 commit 7048dbd

File tree

2 files changed

+8
-4
lines changed

2 files changed

+8
-4
lines changed

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.24.0"
1+
__version__ = "0.24.1"
22

33

44
def get_sdk_version():

src/unstract/sdk/index.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@ def __init__(self, tool: BaseTool):
3131
# TODO: Inherit from StreamMixin and avoid using BaseTool
3232
self.tool = tool
3333

34-
def get_text_from_index(self, embedding_type: str, vector_db: str, doc_id: str):
34+
def get_text_from_index(
35+
self, embedding_type: str, vector_db: str, doc_id: str
36+
) -> Optional[str]:
3537
embedd_helper = ToolEmbedding(tool=self.tool)
3638
embedding_li = embedd_helper.get_embedding(adapter_instance_id=embedding_type)
3739
embedding_dimension = embedd_helper.get_embedding_length(embedding_li)
@@ -326,8 +328,10 @@ def generate_file_id(
326328
"vector_db_config": ToolAdapter.get_adapter_config(self.tool, vector_db),
327329
"embedding_config": ToolAdapter.get_adapter_config(self.tool, embedding),
328330
"x2text_config": ToolAdapter.get_adapter_config(self.tool, x2text),
329-
"chunk_size": chunk_size,
330-
"chunk_overlap": chunk_overlap,
331+
# Typed and hashed as strings since the final hash is persisted
332+
# and this is required to be backward compatible
333+
"chunk_size": str(chunk_size),
334+
"chunk_overlap": str(chunk_overlap),
331335
}
332336
# JSON keys are sorted to ensure that the same key gets hashed even in
333337
# case where the fields are reordered.

0 commit comments

Comments
 (0)