|
| 1 | +import json |
| 2 | +from typing import Optional |
| 3 | + |
| 4 | +from unstract.sdk.adapter import ToolAdapter |
| 5 | +from unstract.sdk.file_storage import FileStorage, FileStorageProvider |
| 6 | +from unstract.sdk.tool.base import BaseTool |
| 7 | +from unstract.sdk.utils import ToolUtils |
| 8 | + |
| 9 | + |
class IndexingUtils:
    """Helpers for deriving identifiers used when indexing documents."""

    @staticmethod
    def generate_index_key(
        vector_db: str,
        embedding: str,
        x2text: str,
        chunk_size: str,
        chunk_overlap: str,
        tool: BaseTool,
        file_path: Optional[str] = None,
        file_hash: Optional[str] = None,
        fs: Optional[FileStorage] = None,
    ) -> str:
        """Generates a unique index key based on the provided configuration,
        file information, instance identifiers, and processing options.

        Args:
            vector_db (str): Identifier passed to
                `ToolAdapter.get_adapter_config` to fetch the vector DB
                adapter's config.
            embedding (str): Identifier passed to
                `ToolAdapter.get_adapter_config` to fetch the embedding
                adapter's config.
            x2text (str): Identifier passed to
                `ToolAdapter.get_adapter_config` to fetch the x2text
                adapter's config.
            chunk_size (str): Chunk size; stringified before hashing.
            chunk_overlap (str): Chunk overlap; stringified before hashing.
            tool (BaseTool): Tool instance used to look up adapter configs.
            file_path (Optional[str]): Path of the file to hash. Only used
                when `file_hash` is not provided.
            file_hash (Optional[str]): Precomputed hash of the file. When
                given, the file is not read.
            fs (FileStorage, optional): File storage used to compute the
                file hash from `file_path`. Defaults to a local-provider
                `FileStorage`, created only when a hash has to be computed.

        Returns:
            str: A unique index key used for indexing the document.

        Raises:
            ValueError: If neither `file_path` nor `file_hash` is provided.
        """
        if not file_path and not file_hash:
            raise ValueError("One of `file_path` or `file_hash` need to be provided")

        if not file_hash:
            # The storage is built lazily instead of in the signature so that
            # importing this module has no side effects and no instance is
            # shared across calls.
            if fs is None:
                fs = FileStorage(provider=FileStorageProvider.LOCAL)
            file_hash = fs.get_hash_from_file(path=file_path)

        # Whole adapter config is used currently even though it contains some keys
        # which might not be relevant to indexing. This is easier for now than
        # marking certain keys of the adapter config as necessary.
        index_key = {
            "file_hash": file_hash,
            "vector_db_config": ToolAdapter.get_adapter_config(tool, vector_db),
            "embedding_config": ToolAdapter.get_adapter_config(tool, embedding),
            "x2text_config": ToolAdapter.get_adapter_config(tool, x2text),
            # Typed and hashed as strings since the final hash is persisted
            # and this is required to be backward compatible
            "chunk_size": str(chunk_size),
            "chunk_overlap": str(chunk_overlap),
        }
        # JSON keys are sorted to ensure that the same key gets hashed even in
        # case where the fields are reordered.
        return ToolUtils.hash_str(json.dumps(index_key, sort_keys=True))
0 commit comments