Merge pull request #8 from Zipstack/fix-index-key-generation-with-x2text

nehabagdia · web-flow · commit 932d55715dbb · 2024-02-24T16:06:54.000+05:30
fix: Include the X2Text adapter ID in index key generation
diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.11.0"
+__version__ = "0.11.1"
 
 
 def get_sdk_version():
diff --git a/src/unstract/sdk/constants.py b/src/unstract/sdk/constants.py
@@ -139,11 +139,13 @@ class ToolSettingsKey:
         LLM_ADAPTER_ID (str): The key for the LLM adapter ID.
         EMBEDDING_ADAPTER_ID (str): The key for the embedding adapter ID.
         VECTOR_DB_ADAPTER_ID (str): The key for the vector DB adapter ID.
+        X2TEXT_ADAPTER_ID (str): The key for the X2Text adapter ID.
     """
 
     LLM_ADAPTER_ID = "llmAdapterId"
     EMBEDDING_ADAPTER_ID = "embeddingAdapterId"
     VECTOR_DB_ADAPTER_ID = "vectorDbAdapterId"
+    X2TEXT_ADAPTER_ID = "x2TextAdapterId"
 
 
 class FileReaderSettings:
diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py
@@ -112,10 +112,10 @@ def index_file(
         self.tool.stream_log("Extracting text from input file")
         full_text = []
         x2text = X2Text(tool=self.tool)
-        x2text_adapter: X2TextAdapter = x2text.get_x2text(
+        x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
             adapter_instance_id=x2text_adapter
         )
-        extracted_text = x2text_adapter.process(input_file_path=file_path)
+        extracted_text = x2text_adapter_inst.process(input_file_path=file_path)
         full_text.append(
             {
                 "section": "full",
@@ -128,6 +128,7 @@ def index_file(
             file_hash=file_hash,
             vector_db=vector_db,
             embedding=embedding_type,
+            x2text=x2text_adapter,
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
         )
@@ -257,6 +258,7 @@ def generate_file_id(
         file_hash: str,
         vector_db: str,
         embedding: str,
+        x2text: str,
         chunk_size: str,
         chunk_overlap: str,
     ) -> str:
@@ -267,13 +269,14 @@ def generate_file_id(
             file_hash (str): Hash of the file contents
             vector_db (str): UUID of the vector DB adapter
             embedding (str): UUID of the embedding adapter
+            x2text (str): UUID of the X2Text adapter
             chunk_size (str): Chunk size for indexing
             chunk_overlap (str): Chunk overlap for indexing
 
         Returns:
             str: Key representing unique ID for a file
         """
         return (
-            f"{tool_id}|{vector_db}|{embedding}|"
+            f"{tool_id}|{vector_db}|{embedding}|{x2text}|"
             f"{chunk_size}|{chunk_overlap}|{file_hash}"
         )
diff --git a/src/unstract/sdk/utils/tool_utils.py b/src/unstract/sdk/utils/tool_utils.py
@@ -80,7 +80,7 @@ def json_to_str(json_to_dump: dict[str, Any]) -> str:
         return compact_json
 
     @staticmethod
-    def get_file_mime_type(self, input_file: Path) -> str:
+    def get_file_mime_type(input_file: Path) -> str:
         """Gets the file MIME type for an input file. Uses libmagic to perform
         the same.
 

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = "0.11.0"`
	`1`	`+__version__ = "0.11.1"`
`2`	`2`
`3`	`3`
`4`	`4`	`def get_sdk_version():`