[FIX] Support for text extraction independent of indexing status. (#141)

harini-venkataraman · web-flow · commit eaccd551946e · 2025-01-08T10:36:41.000+05:30
* Exception handling for Prompt Service

* Fix/handling extraction for duplicate documents

* Adding validation before extraction
diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py
@@ -73,8 +73,8 @@ def query_index(
 
         try:
             self.tool.stream_log(
-                    f">>> Querying '{vector_db_instance_id}' for {doc_id}..."
-                )
+                f">>> Querying '{vector_db_instance_id}' for {doc_id}..."
+            )
             try:
                 doc_id_eq_filter = MetadataFilter.from_dict(
                     {
@@ -287,6 +287,20 @@ def index(
 
             if doc_id_found and not reindex:
                 self.tool.stream_log(f"File was indexed already under {doc_id}")
+
+                if not fs.exists(output_file_path):
+                    # Workaround: the document is already indexed but its
+                    # extracted text is missing at output_file_path (e.g. same
+                    # file uploaded in two projects). Revisit after a permanent fix.
+                    extracted_text = self.extract_text(
+                        x2text_instance_id=x2text_instance_id,
+                        file_path=file_path,
+                        output_file_path=output_file_path,
+                        enable_highlight=enable_highlight,
+                        usage_kwargs=usage_kwargs,
+                        process_text=process_text,
+                        fs=fs,
+                    )
                 return doc_id
 
             extracted_text = self.extract_text(
@@ -298,7 +312,6 @@ def index(
                 process_text=process_text,
                 fs=fs,
             )
-
             if not extracted_text:
                 raise IndexingError("No text available to index")