Merge pull request #7 from Zipstack/index-file-update-with-x2text-adapter-usage

nehabagdia · web-flow · commit 069322c4722b · 2024-02-24T11:29:56.000+05:30
fix: Updated index_file() to use x2text adapter
diff --git a/pdm.lock b/pdm.lock
diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.10.1"
+__version__ = "0.11.0"
 
 
 def get_sdk_version():
diff --git a/src/unstract/sdk/exceptions.py b/src/unstract/sdk/exceptions.py
@@ -11,6 +11,3 @@ def __init__(
     @property
     def user_message(self) -> Optional[str]:
         return self._user_message
-
-    def __str__(self) -> str:
-        return f"{self.message}"
diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py
@@ -1,12 +1,9 @@
-import os
-import shutil
-import zipfile
 from typing import Optional
 
-import filetype
 from llama_index import Document, StorageContext, VectorStoreIndex
 from llama_index.node_parser import SimpleNodeParser
 from llama_index.vector_stores import VectorStoreQuery, VectorStoreQueryResult
+from unstract.adapters.x2text.x2text_adapter import X2TextAdapter
 
 from unstract.sdk.constants import LogLevel, ToolEnv
 from unstract.sdk.embedding import ToolEmbedding
@@ -15,12 +12,7 @@
 from unstract.sdk.utils import ToolUtils
 from unstract.sdk.utils.service_context import ServiceContext
 from unstract.sdk.vector_db import ToolVectorDB
-
-allowed_pdf_to_text_converters = [
-    "default",
-    "unstract_llm_whisperer",
-    "unstract_camelot",
-]
+from unstract.sdk.x2txt import X2Text
 
 
 class ToolIndex:
@@ -106,93 +98,30 @@ def index_file(
         tool_id: str,
         embedding_type: str,
         vector_db: str,
+        x2text_adapter: str,
         file_path: str,
         chunk_size: int,
         chunk_overlap: int,
         reindex: bool = False,
-        converter: str = "default",
         file_hash: Optional[str] = None,
     ):
-        if converter not in allowed_pdf_to_text_converters:
-            self.tool.stream_log(
-                "pdf-to-text-converters must be one of "
-                f"{allowed_pdf_to_text_converters}",
-                level=LogLevel.ERROR,
-            )
-            raise SdkException(
-                "pdf-to-text-converters must be one of "
-                f"{allowed_pdf_to_text_converters}"
-            )
-
-        input_file_type = None
-        input_file_type_mime = None
-
         # Make file content hash if not available
         if not file_hash:
             file_hash = ToolUtils.get_hash_from_file(file_path=file_path)
-        with open(file_path, mode="rb") as input_file_obj:
-            sample_contents = input_file_obj.read(100)
-            input_file_type = filetype.guess(sample_contents)
-
-        if input_file_type is None:
-            input_file_type_mime = "text/plain"
-        else:
-            input_file_type_mime = input_file_type.MIME
-
-        self.tool.stream_log(f"Input file type: {input_file_type_mime}")
 
+        self.tool.stream_log("Extracting text from input file")
         full_text = []
-
-        if input_file_type_mime == "text/plain":
-            with open(file_path) as input_file_obj:
-                full_text.append(
-                    {
-                        "section": "full",
-                        "text_contents": self._cleanup_text(
-                            input_file_obj.read()
-                        ),
-                    }
-                )
-
-        elif input_file_type_mime == "application/pdf":
-            raise SdkException(
-                "Indexing of PDF files is not supported currently"
-            )
-            # TODO: Make use of adapters to convert X2Text
-            # self.tool.stream_log(f"PDF to text converter: {converter}")
-            # if converter == "unstract_llm_whisperer" or converter == "default":  # noqa
-            #     full_text.append(
-            #         {
-            #             "section": "full",
-            #             "text_contents": self._cleanup_text(
-            #                 x2txt.generate_whisper(
-            #                     input_file=file_path,
-            #                     mode="text",
-            #                     dump_text=True,
-            #                 )
-            #             ),
-            #         }
-            #     )
-            # else:
-            #     # TODO : Support for Camelot
-            #     x2txt = X2Text(tool=self.tool)
-
-        elif input_file_type_mime == "application/zip":
-            self.tool.stream_log("Zip file extraction required")
-            with zipfile.ZipFile(file_path, "r") as zip_ref:
-                file_name_from_path = os.path.basename(file_path)
-                temp_directory = f"/tmp/unstract_zip/{file_name_from_path}"
-                # If temp_directory exists, delete it and create it again
-                if os.path.exists(temp_directory):
-                    shutil.rmtree(temp_directory)
-                os.makedirs(temp_directory)
-                zip_ref.extractall(temp_directory)
-        else:
-            self.tool.stream_log(
-                f"Unsupported file type: {input_file_type_mime}",
-                level=LogLevel.ERROR,
-            )
-            raise SdkException(f"Unsupported file type: {input_file_type_mime}")
+        x2text = X2Text(tool=self.tool)
+        x2text_adapter: X2TextAdapter = x2text.get_x2text(
+            adapter_instance_id=x2text_adapter
+        )
+        extracted_text = x2text_adapter.process(input_file_path=file_path)
+        full_text.append(
+            {
+                "section": "full",
+                "text_contents": self._cleanup_text(extracted_text),
+            }
+        )
 
         doc_id = ToolIndex.generate_file_id(
             tool_id=tool_id,
diff --git a/src/unstract/sdk/tool/validator.py b/src/unstract/sdk/tool/validator.py
@@ -3,11 +3,12 @@
 from pathlib import Path
 from typing import Any
 
-import magic
 from jsonschema import Draft202012Validator, ValidationError, validators
+
 from unstract.sdk.constants import MetadataKey, PropKey
 from unstract.sdk.tool.base import BaseTool
 from unstract.sdk.tool.mime_types import EXT_MIME_MAP
+from unstract.sdk.utils import ToolUtils
 
 
 def extend_with_default(validator_class: Any) -> Any:
@@ -211,26 +212,10 @@ def _validate_file_type(self, input_file: Path) -> None:
                 )
             allowed_mimes.append(EXT_MIME_MAP[ext])
 
-        input_file_mime = self._get_file_mime(input_file=input_file)
+        input_file_mime = ToolUtils.get_file_mime_type(input_file=input_file)
+        self.tool.stream_log(f"Input file MIME: {input_file_mime}")
         if input_file_mime not in allowed_mimes:
             self.tool.stream_error_and_exit(
                 f"File type of {input_file_mime} is not supported by"
                 " the tool, check its PROPERTIES for a list of supported types"
             )
-
-    def _get_file_mime(self, input_file: Path) -> str:
-        """Gets the file MIME type for an input file. Uses libmagic to perform
-        the same.
-
-        Args:
-            input_file (Path): Path object of the input file
-
-        Returns:
-            str: MIME type of the file
-        """
-        input_file_mime = ""
-        with open(input_file, mode="rb") as input_file_obj:
-            sample_contents = input_file_obj.read(100)
-            input_file_mime = magic.from_buffer(sample_contents, mime=True)
-        self.tool.stream_log(f"Input file MIME: {input_file_mime}")
-        return input_file_mime
diff --git a/src/unstract/sdk/utils/tool_utils.py b/src/unstract/sdk/utils/tool_utils.py
@@ -1,7 +1,10 @@
 import json
 from hashlib import md5, sha256
+from pathlib import Path
 from typing import Any
 
+import magic
+
 from unstract.sdk.constants import FileReaderSettings
 
 
@@ -75,3 +78,21 @@ def json_to_str(json_to_dump: dict[str, Any]) -> str:
         """
         compact_json = json.dumps(json_to_dump, separators=(",", ":"))
         return compact_json
+
+    @staticmethod
+    def get_file_mime_type(input_file: Path) -> str:
+        """Gets the file MIME type for an input file. Uses libmagic to perform
+        the same.
+
+        Args:
+            input_file (Path): Path object of the input file
+
+        Returns:
+            str: MIME type of the file
+        """
+        # Static method: takes no `self`, so callers can invoke it directly
+        # as ToolUtils.get_file_mime_type(input_file=...).
+        with open(input_file, mode="rb") as input_file_obj:
+            # Only the first 100 bytes are sampled for detection.
+            sample_contents = input_file_obj.read(100)
+        return magic.from_buffer(sample_contents, mime=True)

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = "0.10.1"`
	`1`	`+__version__ = "0.11.0"`
`2`	`2`
`3`	`3`
`4`	`4`	`def get_sdk_version():`