[FIX] Refactored indexing, extraction and retrieval (#172)

harini-venkataraman · web-flow · commit fc0fe4ff3fa4 · 2025-03-19T12:07:53.000+05:30
* Exception handling for Prompt Service

* refactor: Indexing API segregation

* refactor: Indexing API segregation

* refactor: Indexing API segregation

* Retrievers - Subquestion & Simple

* Addressing review comments

* Addressing review comments

* Moving helpers to application

* Remove unused exceptions

Signed-off-by: harini-venkataraman <115449948+harini-venkataraman@users.noreply.github.com>

* Adding Index util to generate index key

* Version bump

* Version bump

* Adding headers for API

---------

Signed-off-by: harini-venkataraman <115449948+harini-venkataraman@users.noreply.github.com>
diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.60.1"
+__version__ = "0.61.0"
 
 
 def get_sdk_version():
diff --git a/src/unstract/sdk/prompt.py b/src/unstract/sdk/prompt.py
@@ -47,6 +47,40 @@ def answer_prompt(
         return self._post_call(
             url_path=url_path, payload=payload, params=params, headers=headers
         )
+    
+    @log_elapsed(operation="INDEX")
+    def index(
+        self, 
+        payload: dict[str, Any], 
+        params: Optional[dict[str, str]] = None,
+        headers: Optional[dict[str, str]] = None,
+    ) -> dict[str, Any]:
+        url_path = "index"
+        if self.is_public_call:
+            url_path = "index-public"
+        return self._post_call(
+            url_path=url_path,
+            payload=payload,
+            params=params,
+            headers=headers,
+        )
+    
+    @log_elapsed(operation="EXTRACT")
+    def extract(
+        self, 
+        payload: dict[str, Any], 
+        params: Optional[dict[str, str]] = None,
+        headers: Optional[dict[str, str]] = None,
+    ) -> dict[str, Any]:
+        url_path = "extract"
+        if self.is_public_call:
+            url_path = "extract-public"
+        return self._post_call(
+            url_path=url_path,
+            payload=payload,
+            params=params,
+            headers=headers,
+        )
 
     def single_pass_extraction(
         self,
diff --git a/src/unstract/sdk/utils/indexing_utils.py b/src/unstract/sdk/utils/indexing_utils.py
@@ -0,0 +1,54 @@
+import json
+from typing import Optional
+
+from unstract.sdk.adapter import ToolAdapter
+from unstract.sdk.file_storage import FileStorage, FileStorageProvider
+from unstract.sdk.tool.base import BaseTool
+from unstract.sdk.utils import ToolUtils
+
+
+class IndexingUtils:
+    @staticmethod
+    def generate_index_key(
+        vector_db: str,
+        embedding: str,
+        x2text: str,
+        chunk_size: str,
+        chunk_overlap: str,
+        tool: BaseTool,
+        file_path: Optional[str] = None,
+        file_hash: Optional[str] = None,
+        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
+    ) -> str:
+        """Generates a unique index key based on the provided configuration,
+        file information, instance identifiers, and processing options.
+
+        Args:
+            fs (FileStorage, optional): File storage for remote storage.
+
+        Returns:
+            str: A unique index key used for indexing the document.
+        """
+        if not file_path and not file_hash:
+            raise ValueError("One of `file_path` or `file_hash` need to be provided")
+
+        if not file_hash:
+            file_hash = fs.get_hash_from_file(path=file_path)
+
+        # Whole adapter config is used currently even though it contains some keys
+        # which might not be relevant to indexing. This is easier for now than
+        # marking certain keys of the adapter config as necessary.
+        index_key = {
+            "file_hash": file_hash,
+            "vector_db_config": ToolAdapter.get_adapter_config(tool, vector_db),
+            "embedding_config": ToolAdapter.get_adapter_config(tool, embedding),
+            "x2text_config": ToolAdapter.get_adapter_config(tool, x2text),
+            # Typed and hashed as strings since the final hash is persisted
+            # and this is required to be backward compatible
+            "chunk_size": str(chunk_size),
+            "chunk_overlap": str(chunk_overlap),
+        }
+        # JSON keys are sorted to ensure that the same key gets hashed even in
+        # case where the fields are reordered.
+        hashed_index_key = ToolUtils.hash_str(json.dumps(index_key, sort_keys=True))
+        return hashed_index_key

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = "0.60.1"`
	`1`	`+__version__ = "0.61.0"`
`2`	`2`
`3`	`3`
`4`	`4`	`def get_sdk_version():`