Commit 14862e9

feat: Index key generation fix (#42)
* Index key generation fix, file hash optimized, dead code removed, version bumped to 0.23.0

* Update src/unstract/sdk/llm.py

  Co-authored-by: Ritwik G <[email protected]>
  Signed-off-by: Chandrasekharan M <[email protected]>

* Added docstrings, minor PR comments addressed

* Version bumped to 0.24.0

  Signed-off-by: Chandrasekharan M <[email protected]>

* Addressed PR comments, sort JSON keys and avoid requiring file_hash for key generation

* Minor fix, added comments

---------

Signed-off-by: Chandrasekharan M <[email protected]>
Co-authored-by: Ritwik G <[email protected]>
1 parent b78f41b commit 14862e9
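The central change is how the index key (doc_id) is built: rather than concatenating a few IDs with a mandatory file hash, generate_file_id now collects the indexing inputs (including the full adapter configs) into a dict, serializes it as JSON with sorted keys, and hashes that string. A minimal, self-contained sketch of the idea follows; it uses hashlib.sha256 as a stand-in for the SDK's ToolUtils.hash_str (the exact digest algorithm is an assumption here), and the field values are purely illustrative:

import hashlib
import json

def make_index_key(params: dict) -> str:
    # Sorting keys makes the serialized form, and hence the hash,
    # independent of the order in which the fields were assembled.
    serialized = json.dumps(params, sort_keys=True)
    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()

a = make_index_key({"tool_id": "t1", "chunk_size": "512", "chunk_overlap": "64"})
b = make_index_key({"chunk_overlap": "64", "tool_id": "t1", "chunk_size": "512"})
assert a == b  # same inputs in a different field order yield the same key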

File tree

10 files changed: +160 -212 lines changed


.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
@@ -50,7 +50,7 @@ repos:
     rev: 24.2.0
     hooks:
       - id: black
-        args: [--config=pyproject.toml, -l 80]
+        args: [--config=pyproject.toml, -l 88]
         language: system
         exclude: |
           (?x)^(
@@ -60,7 +60,7 @@ repos:
     rev: 7.0.0
     hooks:
       - id: flake8
-        args: [--max-line-length=80]
+        args: [--max-line-length=88]
         exclude: |
           (?x)^(
             .*migrations/.*\.py|

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ lint = [
 ]
 
 [tool.isort]
-line_length = 80
+line_length = 88
 multi_line_output = 3
 include_trailing_comma = true
 force_grid_wrap = 0

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-__version__ = "0.23.0"
+__version__ = "0.24.0"
 
 
 def get_sdk_version():

src/unstract/sdk/constants.py

Lines changed: 0 additions & 4 deletions
@@ -146,7 +146,3 @@ class ToolSettingsKey:
     EMBEDDING_ADAPTER_ID = "embeddingAdapterId"
     VECTOR_DB_ADAPTER_ID = "vectorDbAdapterId"
     X2TEXT_ADAPTER_ID = "x2TextAdapterId"
-
-
-class FileReaderSettings:
-    FILE_READER_CHUNK_SIZE = 8192

src/unstract/sdk/embedding.py

Lines changed: 21 additions & 32 deletions
@@ -1,60 +1,49 @@
-from typing import Optional
-
 from llama_index.core.embeddings import BaseEmbedding
 from unstract.adapters.constants import Common
 from unstract.adapters.embedding import adapters
 
 from unstract.sdk.adapters import ToolAdapter
-from unstract.sdk.constants import LogLevel, ToolSettingsKey
+from unstract.sdk.constants import LogLevel
 from unstract.sdk.exceptions import SdkError
 from unstract.sdk.tool.base import BaseTool
 
 
 class ToolEmbedding:
     __TEST_SNIPPET = "Hello, I am Unstract"
 
-    def __init__(self, tool: BaseTool, tool_settings: dict[str, str] = {}):
+    def __init__(self, tool: BaseTool):
         self.tool = tool
         self.max_tokens = 1024 * 16
         self.embedding_adapters = adapters
-        self.embedding_adapter_instance_id = tool_settings.get(
-            ToolSettingsKey.EMBEDDING_ADAPTER_ID
-        )
-        self.embedding_adapter_id: Optional[str] = None
 
-    def get_embedding(
-        self, adapter_instance_id: Optional[str] = None
-    ) -> BaseEmbedding:
-        adapter_instance_id = (
-            adapter_instance_id
-            if adapter_instance_id
-            else self.embedding_adapter_instance_id
-        )
-        if not adapter_instance_id:
-            raise SdkError(
-                f"Adapter_instance_id does not have "
-                f"a valid value: {adapter_instance_id}"
-            )
+    def get_embedding(self, adapter_instance_id: str) -> BaseEmbedding:
+        """Gets an instance of LlamaIndex's embedding object.
+
+        Args:
+            adapter_instance_id (str): UUID of the embedding adapter
+
+        Returns:
+            BaseEmbedding: Embedding instance
+        """
         try:
             embedding_config_data = ToolAdapter.get_adapter_config(
                 self.tool, adapter_instance_id
             )
             embedding_adapter_id = embedding_config_data.get(Common.ADAPTER_ID)
-            self.embedding_adapter_id = embedding_adapter_id
-            if embedding_adapter_id in self.embedding_adapters:
-                embedding_adapter = self.embedding_adapters[
-                    embedding_adapter_id
-                ][Common.METADATA][Common.ADAPTER]
-                embedding_metadata = embedding_config_data.get(
-                    Common.ADAPTER_METADATA
-                )
-                embedding_adapter_class = embedding_adapter(embedding_metadata)
-                return embedding_adapter_class.get_embedding_instance()
-            else:
+            if embedding_adapter_id not in self.embedding_adapters:
                 raise SdkError(
                     f"Embedding adapter not supported : "
                     f"{embedding_adapter_id}"
                 )
+
+            embedding_adapter = self.embedding_adapters[embedding_adapter_id][
+                Common.METADATA
+            ][Common.ADAPTER]
+            embedding_metadata = embedding_config_data.get(
+                Common.ADAPTER_METADATA
+            )
+            embedding_adapter_class = embedding_adapter(embedding_metadata)
+            return embedding_adapter_class.get_embedding_instance()
         except Exception as e:
             self.tool.stream_log(
                 log=f"Error getting embedding: {e}", level=LogLevel.ERROR

src/unstract/sdk/index.py

Lines changed: 50 additions & 64 deletions
@@ -1,3 +1,4 @@
+import json
 from typing import Optional
 
 from llama_index.core import Document
@@ -14,14 +15,13 @@
 from unstract.adapters.exceptions import AdapterError
 from unstract.adapters.x2text.x2text_adapter import X2TextAdapter
 
+from unstract.sdk.adapters import ToolAdapter
 from unstract.sdk.constants import LogLevel, ToolEnv
 from unstract.sdk.embedding import ToolEmbedding
 from unstract.sdk.exceptions import IndexingError, SdkError
 from unstract.sdk.tool.base import BaseTool
 from unstract.sdk.utils import ToolUtils
-from unstract.sdk.utils.callback_manager import (
-    CallbackManager as UNCallbackManager,
-)
+from unstract.sdk.utils.callback_manager import CallbackManager as UNCallbackManager
 from unstract.sdk.vector_db import ToolVectorDB
 from unstract.sdk.x2txt import X2Text
 
@@ -31,18 +31,9 @@ def __init__(self, tool: BaseTool):
         # TODO: Inherit from StreamMixin and avoid using BaseTool
         self.tool = tool
 
-    def get_text_from_index(
-        self, embedding_type: str, vector_db: str, doc_id: str
-    ):
+    def get_text_from_index(self, embedding_type: str, vector_db: str, doc_id: str):
         embedd_helper = ToolEmbedding(tool=self.tool)
-        embedding_li = embedd_helper.get_embedding(
-            adapter_instance_id=embedding_type
-        )
-        if embedding_li is None:
-            self.tool.stream_log(
-                f"Error loading {embedding_type}", level=LogLevel.ERROR
-            )
-            raise SdkError(f"Error loading {embedding_type}")
+        embedding_li = embedd_helper.get_embedding(adapter_instance_id=embedding_type)
         embedding_dimension = embedd_helper.get_embedding_length(embedding_li)
 
         vdb_helper = ToolVectorDB(
@@ -53,12 +44,6 @@ def get_text_from_index(
             embedding_dimension=embedding_dimension,
         )
 
-        if vector_db_li is None:
-            self.tool.stream_log(
-                f"Error loading {vector_db}", level=LogLevel.ERROR
-            )
-            raise SdkError(f"Error loading {vector_db}")
-
         try:
             self.tool.stream_log(f">>> Querying {vector_db}...")
             self.tool.stream_log(f">>> {doc_id}")
@@ -149,48 +134,33 @@ def index_file(
         Returns:
             str: A unique ID for the file and indexing arguments combination
         """
-        # Make file content hash if not available
-        if not file_hash:
-            file_hash = ToolUtils.get_hash_from_file(file_path=file_path)
-
-        doc_id = ToolIndex.generate_file_id(
+        doc_id = self.generate_file_id(
             tool_id=tool_id,
-            file_hash=file_hash,
             vector_db=vector_db,
             embedding=embedding_type,
             x2text=x2text_adapter,
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap,
+            chunk_size=str(chunk_size),
+            chunk_overlap=str(chunk_overlap),
+            file_path=file_path,
+            file_hash=file_hash,
         )
-
         self.tool.stream_log(f"Checking if doc_id {doc_id} exists")
 
-        vdb_helper = ToolVectorDB(
-            tool=self.tool,
-        )
-
+        # Get embedding instance
         embedd_helper = ToolEmbedding(tool=self.tool)
+        embedding_li = embedd_helper.get_embedding(adapter_instance_id=embedding_type)
+        embedding_dimension = embedd_helper.get_embedding_length(embedding_li)
 
-        embedding_li = embedd_helper.get_embedding(
-            adapter_instance_id=embedding_type
+        # Get vectorDB instance
+        vdb_helper = ToolVectorDB(
+            tool=self.tool,
         )
-        if embedding_li is None:
-            self.tool.stream_log(
-                f"Error loading {embedding_type}", level=LogLevel.ERROR
-            )
-            raise SdkError(f"Error loading {embedding_type}")
-
-        embedding_dimension = embedd_helper.get_embedding_length(embedding_li)
         vector_db_li = vdb_helper.get_vector_db(
             adapter_instance_id=vector_db,
             embedding_dimension=embedding_dimension,
         )
-        if vector_db_li is None:
-            self.tool.stream_log(
-                f"Error loading {vector_db}", level=LogLevel.ERROR
-            )
-            raise SdkError(f"Error loading {vector_db}")
 
+        # Checking if document is already indexed against doc_id
         doc_id_eq_filter = MetadataFilter.from_dict(
             {"key": "doc_id", "operator": FilterOperator.EQ, "value": doc_id}
         )
@@ -275,26 +245,20 @@ def index_file(
             parser = SimpleNodeParser.from_defaults(
                 chunk_size=len(documents[0].text) + 10, chunk_overlap=0
             )
-            nodes = parser.get_nodes_from_documents(
-                documents, show_progress=True
-            )
+            nodes = parser.get_nodes_from_documents(documents, show_progress=True)
             node = nodes[0]
             node.embedding = embedding_li.get_query_embedding(" ")
            vector_db_li.add(nodes=[node])
             self.tool.stream_log("Added node to vector db")
         else:
-            storage_context = StorageContext.from_defaults(
-                vector_store=vector_db_li
-            )
+            storage_context = StorageContext.from_defaults(vector_store=vector_db_li)
             parser = SimpleNodeParser.from_defaults(
                 chunk_size=chunk_size, chunk_overlap=chunk_overlap
             )
 
             # Set callback_manager to collect Usage stats
             callback_manager = UNCallbackManager.set_callback_manager(
-                platform_api_key=self.tool.get_env_or_die(
-                    ToolEnv.PLATFORM_API_KEY
-                ),
+                platform_api_key=self.tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY),
                 embedding=embedding_li,
             )
@@ -319,31 +283,53 @@ def index_file(
         self.tool.stream_log("File has been indexed successfully")
         return doc_id
 
-    @staticmethod
     def generate_file_id(
+        self,
         tool_id: str,
-        file_hash: str,
         vector_db: str,
         embedding: str,
         x2text: str,
         chunk_size: str,
         chunk_overlap: str,
+        file_path: Optional[str] = None,
+        file_hash: Optional[str] = None,
     ) -> str:
         """Generates a unique ID useful for identifying files during indexing.
 
         Args:
-            tool_id (str): Unique ID of the tool developed / exported
-            file_hash (str): Hash of the file contents
+            tool_id (str): Unique ID of the tool or workflow
             vector_db (str): UUID of the vector DB adapter
             embedding (str): UUID of the embedding adapter
             x2text (str): UUID of the X2Text adapter
             chunk_size (str): Chunk size for indexing
             chunk_overlap (str): Chunk overlap for indexing
+            file_path (Optional[str]): Path to the file that needs to be indexed.
+                Defaults to None. One of file_path or file_hash needs to be specified.
+            file_hash (Optional[str], optional): SHA256 hash of the file.
+                Defaults to None. If None, the hash is generated with file_path.
 
         Returns:
             str: Key representing unique ID for a file
         """
-        return (
-            f"{tool_id}|{vector_db}|{embedding}|{x2text}|"
-            f"{chunk_size}|{chunk_overlap}|{file_hash}"
-        )
+        if not file_path and not file_hash:
+            raise ValueError("One of `file_path` or `file_hash` need to be provided")
+
+        if not file_hash:
+            file_hash = ToolUtils.get_hash_from_file(file_path=file_path)
+
+        # Whole adapter config is used currently even though it contains some keys
+        # which might not be relevant to indexing. This is easier for now than
+        # marking certain keys of the adapter config as necessary.
+        index_key = {
+            "tool_id": tool_id,
+            "file_hash": file_hash,
+            "vector_db_config": ToolAdapter.get_adapter_config(self.tool, vector_db),
+            "embedding_config": ToolAdapter.get_adapter_config(self.tool, embedding),
+            "x2text_config": ToolAdapter.get_adapter_config(self.tool, x2text),
+            "chunk_size": chunk_size,
+            "chunk_overlap": chunk_overlap,
+        }
+        # JSON keys are sorted to ensure that the same key gets hashed even in
+        # case where the fields are reordered.
+        hashed_index_key = ToolUtils.hash_str(json.dumps(index_key, sort_keys=True))
+        return hashed_index_key
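Since generate_file_id is now an instance method that accepts either a file path or a precomputed hash, a caller only needs to supply one of the two. A hedged usage sketch; the tool, adapter UUIDs, and file path below are placeholders rather than values from this commit:

from unstract.sdk.index import ToolIndex

# `tool` is an existing BaseTool instance; adapter IDs come from tool settings
tool_index = ToolIndex(tool=tool)
doc_id = tool_index.generate_file_id(
    tool_id=tool_id,
    vector_db=vector_db_adapter_id,
    embedding=embedding_adapter_id,
    x2text=x2text_adapter_id,
    chunk_size=str(chunk_size),
    chunk_overlap=str(chunk_overlap),
    file_path="/path/to/input.pdf",  # hash is derived from this path...
    # file_hash=precomputed_sha256,  # ...or can be passed directly instead
)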
