Skip to content

Commit fc0fe4f

Browse files
[FIX] Refactored indexing, extraction and retrieval (#172)
* Exception handling for Prompt Service * refactor: Indexing API segregation * refactor: Indexing API segregation * refactor: Indexing API segregation * Retrievers - Subquestion & Simple * Addressing review comments * Addressing review comments * Moving helpers to application * Remove unused exceptions Signed-off-by: harini-venkataraman <[email protected]> * Adding Index util to generate index key * Version bump * Version bump * Adding headers for API --------- Signed-off-by: harini-venkataraman <[email protected]>
1 parent 8d1e39a commit fc0fe4f

File tree

3 files changed

+89
-1
lines changed

3 files changed

+89
-1
lines changed

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.60.1"
1+
__version__ = "0.61.0"
22

33

44
def get_sdk_version():

src/unstract/sdk/prompt.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,40 @@ def answer_prompt(
4747
return self._post_call(
4848
url_path=url_path, payload=payload, params=params, headers=headers
4949
)
50+
51+
@log_elapsed(operation="INDEX")
def index(
    self,
    payload: dict[str, Any],
    params: Optional[dict[str, str]] = None,
    headers: Optional[dict[str, str]] = None,
) -> dict[str, Any]:
    """Submit an indexing request through ``self._post_call``.

    The request goes to the ``index-public`` route when
    ``self.is_public_call`` is truthy, and to ``index`` otherwise.

    Args:
        payload: Request body forwarded unchanged to ``_post_call``.
        params: Optional query parameters forwarded to ``_post_call``.
        headers: Optional headers forwarded to ``_post_call``.

    Returns:
        Whatever ``self._post_call`` returns for the request.
    """
    # Public callers are routed to the dedicated "-public" endpoint.
    endpoint = "index-public" if self.is_public_call else "index"
    return self._post_call(
        url_path=endpoint, payload=payload, params=params, headers=headers
    )
67+
68+
@log_elapsed(operation="EXTRACT")
def extract(
    self,
    payload: dict[str, Any],
    params: Optional[dict[str, str]] = None,
    headers: Optional[dict[str, str]] = None,
) -> dict[str, Any]:
    """Submit an extraction request through ``self._post_call``.

    The request goes to the ``extract-public`` route when
    ``self.is_public_call`` is truthy, and to ``extract`` otherwise.

    Args:
        payload: Request body forwarded unchanged to ``_post_call``.
        params: Optional query parameters forwarded to ``_post_call``.
        headers: Optional headers forwarded to ``_post_call``.

    Returns:
        Whatever ``self._post_call`` returns for the request.
    """
    # Public callers are routed to the dedicated "-public" endpoint.
    endpoint = "extract-public" if self.is_public_call else "extract"
    return self._post_call(
        url_path=endpoint, payload=payload, params=params, headers=headers
    )
5084

5185
def single_pass_extraction(
5286
self,
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import json
2+
from typing import Optional
3+
4+
from unstract.sdk.adapter import ToolAdapter
5+
from unstract.sdk.file_storage import FileStorage, FileStorageProvider
6+
from unstract.sdk.tool.base import BaseTool
7+
from unstract.sdk.utils import ToolUtils
8+
9+
10+
class IndexingUtils:
    """Static helpers used when indexing documents."""

    @staticmethod
    def generate_index_key(
        vector_db: str,
        embedding: str,
        x2text: str,
        chunk_size: str,
        chunk_overlap: str,
        tool: BaseTool,
        file_path: Optional[str] = None,
        file_hash: Optional[str] = None,
        fs: Optional[FileStorage] = None,
    ) -> str:
        """Generate an index key from the file hash and indexing configuration.

        The key is the hash of a JSON document holding the file hash, the
        adapter configs looked up for ``vector_db``, ``embedding`` and
        ``x2text``, and the chunking parameters.

        Args:
            vector_db: Identifier passed to ``ToolAdapter.get_adapter_config``
                to look up the vector DB adapter config.
            embedding: Identifier passed to ``ToolAdapter.get_adapter_config``
                to look up the embedding adapter config.
            x2text: Identifier passed to ``ToolAdapter.get_adapter_config``
                to look up the x2text adapter config.
            chunk_size: Chunk size; converted with ``str()`` before hashing.
            chunk_overlap: Chunk overlap; converted with ``str()`` before
                hashing.
            tool: Tool instance passed to ``ToolAdapter.get_adapter_config``.
            file_path: Path of the file to hash. Only read when ``file_hash``
                is not supplied.
            file_hash: Precomputed hash of the file. When given, the file is
                not read and ``fs`` is not used.
            fs: File storage used to compute the hash from ``file_path``.
                Defaults to a ``FileStorage`` with the ``LOCAL`` provider,
                created only when a hash actually has to be computed.

        Returns:
            str: The hashed index key used for indexing the document.

        Raises:
            ValueError: If neither ``file_path`` nor ``file_hash`` is given.
        """
        if not file_path and not file_hash:
            raise ValueError("One of `file_path` or `file_hash` need to be provided")

        if not file_hash:
            # Build the default storage lazily: a call in the signature default
            # would run once at import time and be shared by every caller.
            if fs is None:
                fs = FileStorage(provider=FileStorageProvider.LOCAL)
            file_hash = fs.get_hash_from_file(path=file_path)

        # Whole adapter config is used currently even though it contains some
        # keys which might not be relevant to indexing. This is easier for now
        # than marking certain keys of the adapter config as necessary.
        index_key = {
            "file_hash": file_hash,
            "vector_db_config": ToolAdapter.get_adapter_config(tool, vector_db),
            "embedding_config": ToolAdapter.get_adapter_config(tool, embedding),
            "x2text_config": ToolAdapter.get_adapter_config(tool, x2text),
            # Typed and hashed as strings since the final hash is persisted
            # and this is required to be backward compatible
            "chunk_size": str(chunk_size),
            "chunk_overlap": str(chunk_overlap),
        }
        # JSON keys are sorted to ensure that the same key gets hashed even in
        # case where the fields are reordered.
        return ToolUtils.hash_str(json.dumps(index_key, sort_keys=True))

0 commit comments

Comments
 (0)