1+ import json
12from typing import Optional
23
34from llama_index .core import Document
1415from unstract .adapters .exceptions import AdapterError
1516from unstract .adapters .x2text .x2text_adapter import X2TextAdapter
1617
18+ from unstract .sdk .adapters import ToolAdapter
1719from unstract .sdk .constants import LogLevel , ToolEnv
1820from unstract .sdk .embedding import ToolEmbedding
1921from unstract .sdk .exceptions import IndexingError , SdkError
2022from unstract .sdk .tool .base import BaseTool
2123from unstract .sdk .utils import ToolUtils
22- from unstract .sdk .utils .callback_manager import (
23- CallbackManager as UNCallbackManager ,
24- )
24+ from unstract .sdk .utils .callback_manager import CallbackManager as UNCallbackManager
2525from unstract .sdk .vector_db import ToolVectorDB
2626from unstract .sdk .x2txt import X2Text
2727
@@ -31,18 +31,9 @@ def __init__(self, tool: BaseTool):
3131 # TODO: Inherit from StreamMixin and avoid using BaseTool
3232 self .tool = tool
3333
34- def get_text_from_index (
35- self , embedding_type : str , vector_db : str , doc_id : str
36- ):
34+ def get_text_from_index (self , embedding_type : str , vector_db : str , doc_id : str ):
3735 embedd_helper = ToolEmbedding (tool = self .tool )
38- embedding_li = embedd_helper .get_embedding (
39- adapter_instance_id = embedding_type
40- )
41- if embedding_li is None :
42- self .tool .stream_log (
43- f"Error loading { embedding_type } " , level = LogLevel .ERROR
44- )
45- raise SdkError (f"Error loading { embedding_type } " )
36+ embedding_li = embedd_helper .get_embedding (adapter_instance_id = embedding_type )
4637 embedding_dimension = embedd_helper .get_embedding_length (embedding_li )
4738
4839 vdb_helper = ToolVectorDB (
@@ -53,12 +44,6 @@ def get_text_from_index(
5344 embedding_dimension = embedding_dimension ,
5445 )
5546
56- if vector_db_li is None :
57- self .tool .stream_log (
58- f"Error loading { vector_db } " , level = LogLevel .ERROR
59- )
60- raise SdkError (f"Error loading { vector_db } " )
61-
6247 try :
6348 self .tool .stream_log (f">>> Querying { vector_db } ..." )
6449 self .tool .stream_log (f">>> { doc_id } " )
@@ -149,48 +134,33 @@ def index_file(
149134 Returns:
150135 str: A unique ID for the file and indexing arguments combination
151136 """
152- # Make file content hash if not available
153- if not file_hash :
154- file_hash = ToolUtils .get_hash_from_file (file_path = file_path )
155-
156- doc_id = ToolIndex .generate_file_id (
137+ doc_id = self .generate_file_id (
157138 tool_id = tool_id ,
158- file_hash = file_hash ,
159139 vector_db = vector_db ,
160140 embedding = embedding_type ,
161141 x2text = x2text_adapter ,
162- chunk_size = chunk_size ,
163- chunk_overlap = chunk_overlap ,
142+ chunk_size = str (chunk_size ),
143+ chunk_overlap = str (chunk_overlap ),
144+ file_path = file_path ,
145+ file_hash = file_hash ,
164146 )
165-
166147 self .tool .stream_log (f"Checking if doc_id { doc_id } exists" )
167148
168- vdb_helper = ToolVectorDB (
169- tool = self .tool ,
170- )
171-
149+ # Get embedding instance
172150 embedd_helper = ToolEmbedding (tool = self .tool )
151+ embedding_li = embedd_helper .get_embedding (adapter_instance_id = embedding_type )
152+ embedding_dimension = embedd_helper .get_embedding_length (embedding_li )
173153
174- embedding_li = embedd_helper .get_embedding (
175- adapter_instance_id = embedding_type
154+ # Get vectorDB instance
155+ vdb_helper = ToolVectorDB (
156+ tool = self .tool ,
176157 )
177- if embedding_li is None :
178- self .tool .stream_log (
179- f"Error loading { embedding_type } " , level = LogLevel .ERROR
180- )
181- raise SdkError (f"Error loading { embedding_type } " )
182-
183- embedding_dimension = embedd_helper .get_embedding_length (embedding_li )
184158 vector_db_li = vdb_helper .get_vector_db (
185159 adapter_instance_id = vector_db ,
186160 embedding_dimension = embedding_dimension ,
187161 )
188- if vector_db_li is None :
189- self .tool .stream_log (
190- f"Error loading { vector_db } " , level = LogLevel .ERROR
191- )
192- raise SdkError (f"Error loading { vector_db } " )
193162
163+ # Checking if document is already indexed against doc_id
194164 doc_id_eq_filter = MetadataFilter .from_dict (
195165 {"key" : "doc_id" , "operator" : FilterOperator .EQ , "value" : doc_id }
196166 )
@@ -275,26 +245,20 @@ def index_file(
275245 parser = SimpleNodeParser .from_defaults (
276246 chunk_size = len (documents [0 ].text ) + 10 , chunk_overlap = 0
277247 )
278- nodes = parser .get_nodes_from_documents (
279- documents , show_progress = True
280- )
248+ nodes = parser .get_nodes_from_documents (documents , show_progress = True )
281249 node = nodes [0 ]
282250 node .embedding = embedding_li .get_query_embedding (" " )
283251 vector_db_li .add (nodes = [node ])
284252 self .tool .stream_log ("Added node to vector db" )
285253 else :
286- storage_context = StorageContext .from_defaults (
287- vector_store = vector_db_li
288- )
254+ storage_context = StorageContext .from_defaults (vector_store = vector_db_li )
289255 parser = SimpleNodeParser .from_defaults (
290256 chunk_size = chunk_size , chunk_overlap = chunk_overlap
291257 )
292258
293259 # Set callback_manager to collect Usage stats
294260 callback_manager = UNCallbackManager .set_callback_manager (
295- platform_api_key = self .tool .get_env_or_die (
296- ToolEnv .PLATFORM_API_KEY
297- ),
261+ platform_api_key = self .tool .get_env_or_die (ToolEnv .PLATFORM_API_KEY ),
298262 embedding = embedding_li ,
299263 )
300264
@@ -319,31 +283,53 @@ def index_file(
319283 self .tool .stream_log ("File has been indexed successfully" )
320284 return doc_id
321285
def generate_file_id(
    self,
    tool_id: str,
    vector_db: str,
    embedding: str,
    x2text: str,
    chunk_size: str,
    chunk_overlap: str,
    file_path: Optional[str] = None,
    file_hash: Optional[str] = None,
) -> str:
    """Generates a unique ID useful for identifying files during indexing.

    The ID is the hash (via ``ToolUtils.hash_str``) of a JSON document built
    from the tool ID, the file hash, the chunking parameters and the
    configuration of each adapter involved.

    Args:
        tool_id (str): Unique ID of the tool or workflow
        vector_db (str): UUID of the vector DB adapter
        embedding (str): UUID of the embedding adapter
        x2text (str): UUID of the X2Text adapter
        chunk_size (str): Chunk size for indexing
        chunk_overlap (str): Chunk overlap for indexing
        file_path (Optional[str]): Path to the file that needs to be indexed.
            Defaults to None. One of file_path or file_hash needs to be
            specified.
        file_hash (Optional[str], optional): SHA256 hash of the file.
            Defaults to None. If None, the hash is generated with file_path.

    Raises:
        ValueError: If neither file_path nor file_hash is provided.

    Returns:
        str: Key representing unique ID for a file
    """
    if not file_path and not file_hash:
        raise ValueError("One of `file_path` or `file_hash` need to be provided")

    if not file_hash:
        file_hash = ToolUtils.get_hash_from_file(file_path=file_path)

    # Whole adapter config is used currently even though it contains some keys
    # which might not be relevant to indexing. This is easier for now than
    # marking certain keys of the adapter config as necessary.
    index_key = {
        "tool_id": tool_id,
        "file_hash": file_hash,
        "vector_db_config": ToolAdapter.get_adapter_config(self.tool, vector_db),
        "embedding_config": ToolAdapter.get_adapter_config(self.tool, embedding),
        "x2text_config": ToolAdapter.get_adapter_config(self.tool, x2text),
        # Normalised to str: JSON encodes 512 and "512" differently, so an
        # int passed by a caller would otherwise yield a different ID than
        # the same value passed as a string.
        "chunk_size": str(chunk_size),
        "chunk_overlap": str(chunk_overlap),
    }
    # JSON keys are sorted to ensure that the same key gets hashed even in
    # case where the fields are reordered.
    return ToolUtils.hash_str(json.dumps(index_key, sort_keys=True))
0 commit comments