Skip to content

Commit eaccd55

Browse files
[FIX] Support for text extraction independent of indexing status. (#141)
* Exception handling for Prompt Service * Fix/handling extraction for duplicate documents * Adding validation before extraction
1 parent 7758532 commit eaccd55

File tree

1 file changed

+16
-3
lines changed

1 file changed

+16
-3
lines changed

src/unstract/sdk/index.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ def query_index(
7373

7474
try:
7575
self.tool.stream_log(
76-
f">>> Querying '{vector_db_instance_id}' for {doc_id}..."
77-
)
76+
f">>> Querying '{vector_db_instance_id}' for {doc_id}..."
77+
)
7878
try:
7979
doc_id_eq_filter = MetadataFilter.from_dict(
8080
{
@@ -287,6 +287,20 @@ def index(
287287

288288
if doc_id_found and not reindex:
289289
self.tool.stream_log(f"File was indexed already under {doc_id}")
290+
291+
if not fs.exists(output_file_path):
292+
# Added this as a workaround to handle extraction
293+
# for documents uploaded twice in different projects.
294+
# To be reconsidered once a permanent fix is in place.
295+
extracted_text = self.extract_text(
296+
x2text_instance_id=x2text_instance_id,
297+
file_path=file_path,
298+
output_file_path=output_file_path,
299+
enable_highlight=enable_highlight,
300+
usage_kwargs=usage_kwargs,
301+
process_text=process_text,
302+
fs=fs,
303+
)
290304
return doc_id
291305

292306
extracted_text = self.extract_text(
@@ -298,7 +312,6 @@ def index(
298312
process_text=process_text,
299313
fs=fs,
300314
)
301-
302315
if not extracted_text:
303316
raise IndexingError("No text available to index")
304317

0 commit comments

Comments
 (0)