fix: Fixed high CPU usage issue related to index (#18)

chandrasekharan-zipstack · web-flow · commit 1ed1ad7f3218 · 2024-03-13T11:15:21.000+05:30
Fixed high CPU usage by extracting text from the input file only after the index check finds that the document is not already indexed.
Bumped SDK to 0.15.1 and removed unused packages
diff --git a/pdm.lock b/pdm.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,10 +19,6 @@ dependencies = [
     "llama-index==0.9.28",
     "tiktoken~=0.4.0",
     "transformers==4.37.0",
-    # LLM Whisperer dependencies
-    "filetype==1.2.0",
-    "pdfplumber==0.10.3",
-    "pytesseract==0.3.10",
 ]
 requires-python = ">=3.9,<3.11.1"
 readme = "README.md"
diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.15.0"
+__version__ = "0.15.1"
 
 
 def get_sdk_version():
diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py
@@ -18,6 +18,7 @@
 
 class ToolIndex:
     def __init__(self, tool: BaseTool):
+        # TODO: Inherit from StreamMixin and avoid using BaseTool
         self.tool = tool
 
     def get_text_from_index(
@@ -133,27 +134,6 @@ def index_file(
         if not file_hash:
             file_hash = ToolUtils.get_hash_from_file(file_path=file_path)
 
-        self.tool.stream_log("Extracting text from input file")
-        full_text = []
-        extracted_text = ""
-        try:
-            x2text = X2Text(tool=self.tool)
-            x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
-                adapter_instance_id=x2text_adapter
-            )
-            extracted_text = x2text_adapter_inst.process(
-                input_file_path=file_path, output_file_path=output_file_path
-            )
-        except AdapterError as e:
-            # Wrapping AdapterErrors with SdkError
-            raise SdkError(str(e)) from e
-        full_text.append(
-            {
-                "section": "full",
-                "text_contents": self._cleanup_text(extracted_text),
-            }
-        )
-
         doc_id = ToolIndex.generate_file_id(
             tool_id=tool_id,
             file_hash=file_hash,
@@ -224,6 +204,27 @@ def index_file(
             doc_id_not_found = True
 
         if doc_id_not_found:
+            self.tool.stream_log("Extracting text from input file")
+            full_text = []
+            extracted_text = ""
+            try:
+                x2text = X2Text(tool=self.tool)
+                x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
+                    adapter_instance_id=x2text_adapter
+                )
+                extracted_text = x2text_adapter_inst.process(
+                    input_file_path=file_path, output_file_path=output_file_path
+                )
+            except AdapterError as e:
+                # Wrapping AdapterErrors with SdkError
+                raise SdkError(str(e)) from e
+            full_text.append(
+                {
+                    "section": "full",
+                    "text_contents": self._cleanup_text(extracted_text),
+                }
+            )
+
             # Check if chunking is required
             documents = []
             for item in full_text:

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = "0.15.0"`
	`1`	`+__version__ = "0.15.1"`
`2`	`2`
`3`	`3`
`4`	`4`	`def get_sdk_version():`