Skip to content

Commit 49635b1

Browse files
fix: Indexing-related improvements and fixes (#25)
Indexing-related improvements and fixes. Signed-off-by: Chandrasekharan M <[email protected]>
1 parent 5bfddb8 commit 49635b1

File tree

3 files changed

+107
-85
lines changed

3 files changed

+107
-85
lines changed

src/unstract/sdk/exceptions.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,9 @@ def __init__(self, message: str = DEFAULT_MESSAGE):
88

99
def __str__(self) -> str:
1010
return self.message
11+
12+
13+
class IndexingError(SdkError):
    """Error raised when indexing a file into the vector DB fails."""

    # Prepended to every message so callers can tell the failing phase apart.
    _PREFIX = "Error with indexing. "

    def __init__(self, message: str = ""):
        super().__init__(self._PREFIX + message)

src/unstract/sdk/index.py

Lines changed: 95 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,19 @@
22

33
from llama_index import Document, StorageContext, VectorStoreIndex
44
from llama_index.node_parser import SimpleNodeParser
5-
from llama_index.vector_stores import VectorStoreQuery, VectorStoreQueryResult
5+
from llama_index.vector_stores import (
6+
FilterOperator,
7+
MetadataFilter,
8+
MetadataFilters,
9+
VectorStoreQuery,
10+
VectorStoreQueryResult,
11+
)
612
from unstract.adapters.exceptions import AdapterError
713
from unstract.adapters.x2text.x2text_adapter import X2TextAdapter
814

915
from unstract.sdk.constants import LogLevel, ToolEnv
1016
from unstract.sdk.embedding import ToolEmbedding
11-
from unstract.sdk.exceptions import SdkError
17+
from unstract.sdk.exceptions import IndexingError, SdkError
1218
from unstract.sdk.tool.base import BaseTool
1319
from unstract.sdk.utils import ToolUtils
1420
from unstract.sdk.utils.service_context import ServiceContext
@@ -172,18 +178,21 @@ def index_file(
172178
)
173179
raise SdkError(f"Error loading {vector_db}")
174180

175-
filter = [{"field_name": "doc_id", "operator": "=", "value": doc_id}]
181+
doc_id_eq_filter = MetadataFilter.from_dict(
182+
{"key": "doc_id", "operator": FilterOperator.EQ, "value": doc_id}
183+
)
184+
filters = MetadataFilters(filters=[doc_id_eq_filter])
176185
q = VectorStoreQuery(
177186
query_embedding=embedding_li.get_query_embedding(" "),
178187
doc_ids=[doc_id],
179-
filters=filter,
188+
filters=filters,
180189
)
181190

182-
doc_id_not_found = True
191+
doc_id_found = False
183192
try:
184193
n: VectorStoreQueryResult = vector_db_li.query(query=q)
185194
if len(n.nodes) > 0:
186-
doc_id_not_found = False
195+
doc_id_found = True
187196
self.tool.stream_log(f"Found {len(n.nodes)} nodes for {doc_id}")
188197
else:
189198
self.tool.stream_log(f"No nodes found for {doc_id}")
@@ -192,7 +201,7 @@ def index_file(
192201
f"Error querying {vector_db}: {e}", level=LogLevel.ERROR
193202
)
194203

195-
if doc_id_not_found is False and reindex:
204+
if doc_id_found and reindex:
196205
# Delete the nodes for the doc_id
197206
try:
198207
vector_db_li.delete(ref_doc_id=doc_id)
@@ -203,87 +212,91 @@ def index_file(
203212
level=LogLevel.ERROR,
204213
)
205214
raise SdkError(f"Error deleting nodes for {doc_id}: {e}")
206-
doc_id_not_found = True
215+
doc_id_found = False
207216

208-
if doc_id_not_found:
209-
self.tool.stream_log("Extracting text from input file")
210-
full_text = []
211-
extracted_text = ""
212-
try:
213-
x2text = X2Text(tool=self.tool)
214-
x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
215-
adapter_instance_id=x2text_adapter
216-
)
217-
extracted_text = x2text_adapter_inst.process(
218-
input_file_path=file_path, output_file_path=output_file_path
219-
)
220-
except AdapterError as e:
221-
# Wrapping AdapterErrors with SdkError
222-
raise SdkError(str(e)) from e
223-
full_text.append(
224-
{
225-
"section": "full",
226-
"text_contents": self._cleanup_text(extracted_text),
227-
}
217+
if doc_id_found:
218+
self.tool.stream_log(f"File was indexed already under {doc_id}")
219+
return doc_id
220+
221+
# Extract text and index
222+
self.tool.stream_log("Extracting text from input file")
223+
full_text = []
224+
extracted_text = ""
225+
try:
226+
x2text = X2Text(tool=self.tool)
227+
x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
228+
adapter_instance_id=x2text_adapter
229+
)
230+
extracted_text = x2text_adapter_inst.process(
231+
input_file_path=file_path, output_file_path=output_file_path
228232
)
233+
except AdapterError as e:
234+
# Wrapping AdapterErrors with SdkError
235+
raise IndexingError(str(e)) from e
236+
full_text.append(
237+
{
238+
"section": "full",
239+
"text_contents": self._cleanup_text(extracted_text),
240+
}
241+
)
229242

230-
# Check if chunking is required
231-
documents = []
232-
for item in full_text:
233-
text = item["text_contents"]
234-
self.tool.stream_log("Indexing file...")
235-
document = Document(
236-
text=text,
237-
doc_id=doc_id,
238-
metadata={"section": item["section"]},
239-
)
240-
document.id_ = doc_id
241-
documents.append(document)
242-
self.tool.stream_log(f"Number of documents: {len(documents)}")
243-
if chunk_size == 0:
244-
parser = SimpleNodeParser.from_defaults(
245-
chunk_size=len(documents[0].text) + 10, chunk_overlap=0
246-
)
247-
nodes = parser.get_nodes_from_documents(
248-
documents, show_progress=True
249-
)
250-
node = nodes[0]
251-
node.embedding = embedding_li.get_query_embedding(" ")
252-
vector_db_li.add(nodes=[node])
253-
self.tool.stream_log("Added node to vector db")
254-
else:
255-
storage_context = StorageContext.from_defaults(
256-
vector_store=vector_db_li
257-
)
258-
parser = SimpleNodeParser.from_defaults(
259-
chunk_size=chunk_size, chunk_overlap=chunk_overlap
260-
)
243+
# Check if chunking is required
244+
documents = []
245+
for item in full_text:
246+
text = item["text_contents"]
247+
self.tool.stream_log("Indexing file...")
248+
document = Document(
249+
text=text,
250+
doc_id=doc_id,
251+
metadata={"section": item["section"]},
252+
)
253+
document.id_ = doc_id
254+
documents.append(document)
255+
self.tool.stream_log(f"Number of documents: {len(documents)}")
256+
if chunk_size == 0:
257+
parser = SimpleNodeParser.from_defaults(
258+
chunk_size=len(documents[0].text) + 10, chunk_overlap=0
259+
)
260+
nodes = parser.get_nodes_from_documents(
261+
documents, show_progress=True
262+
)
263+
node = nodes[0]
264+
node.embedding = embedding_li.get_query_embedding(" ")
265+
vector_db_li.add(nodes=[node])
266+
self.tool.stream_log("Added node to vector db")
267+
else:
268+
storage_context = StorageContext.from_defaults(
269+
vector_store=vector_db_li
270+
)
271+
parser = SimpleNodeParser.from_defaults(
272+
chunk_size=chunk_size, chunk_overlap=chunk_overlap
273+
)
261274

262-
service_context = ServiceContext.get_service_context(
263-
platform_api_key=self.tool.get_env_or_die(
264-
ToolEnv.PLATFORM_API_KEY
265-
),
266-
embed_model=embedding_li,
267-
node_parser=parser,
268-
)
275+
service_context = ServiceContext.get_service_context(
276+
platform_api_key=self.tool.get_env_or_die(
277+
ToolEnv.PLATFORM_API_KEY
278+
),
279+
embed_model=embedding_li,
280+
node_parser=parser,
281+
)
269282

270-
self.tool.stream_log("Adding nodes to vector db...")
271-
try:
272-
VectorStoreIndex.from_documents(
273-
documents,
274-
storage_context=storage_context,
275-
show_progress=True,
276-
service_context=service_context,
277-
)
278-
except Exception as e:
279-
self.tool.stream_log(
280-
f"Error adding nodes to vector db: {e}",
281-
level=LogLevel.ERROR,
282-
)
283-
raise SdkError(f"Error adding nodes to vector db: {e}")
284-
self.tool.stream_log("Added nodes to vector db")
283+
self.tool.stream_log("Adding nodes to vector db...")
284+
try:
285+
VectorStoreIndex.from_documents(
286+
documents,
287+
storage_context=storage_context,
288+
show_progress=True,
289+
service_context=service_context,
290+
)
291+
except Exception as e:
292+
self.tool.stream_log(
293+
f"Error adding nodes to vector db: {e}",
294+
level=LogLevel.ERROR,
295+
)
296+
raise SdkError(f"Error adding nodes to vector db: {e}")
297+
self.tool.stream_log("Added nodes to vector db")
285298

286-
self.tool.stream_log("File has been indexed successfully")
299+
self.tool.stream_log("File has been indexed successfully")
287300
return doc_id
288301

289302
@staticmethod

src/unstract/sdk/tool/executor.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from pathlib import Path
55
from typing import Any
66

7+
from unstract.adapters import get_adapter_version
8+
79
from unstract.sdk import get_sdk_version
810
from unstract.sdk.constants import Command
911
from unstract.sdk.tool.base import BaseTool
@@ -55,9 +57,10 @@ def execute_run(self, args: argparse.Namespace) -> None:
5557

5658
self.tool.stream_log(
5759
f"Running tool for "
58-
f"Workflow ID: {self.tool.workflow_id} "
59-
f"Execution ID: {self.tool.execution_id} "
60-
f"SDK Version: {get_sdk_version()}"
60+
f"Workflow ID: {self.tool.workflow_id}, "
61+
f"Execution ID: {self.tool.execution_id}, "
62+
f"SDK Version: {get_sdk_version()}, "
63+
f"adapter Version: {get_adapter_version()}"
6164
)
6265
self.tool.run(
6366
settings=settings,

0 commit comments

Comments
 (0)