Skip to content

Commit 1ed1ad7

Browse files
fix: Fixed high CPU usage issue related to index (#18)
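Fixed high CPU usage by deferring text extraction until after the index check, so the input file is extracted only when its document ID is not found in the index. Bumped the SDK version to 0.15.1 and removed unused packages.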
1 parent aec8399 commit 1ed1ad7

File tree

4 files changed

+24
-155
lines changed

4 files changed

+24
-155
lines changed

pdm.lock

Lines changed: 1 addition & 129 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,6 @@ dependencies = [
1919
"llama-index==0.9.28",
2020
"tiktoken~=0.4.0",
2121
"transformers==4.37.0",
22-
# LLM Whisperer dependencies
23-
"filetype==1.2.0",
24-
"pdfplumber==0.10.3",
25-
"pytesseract==0.3.10",
2622
]
2723
requires-python = ">=3.9,<3.11.1"
2824
readme = "README.md"

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.15.0"
1+
__version__ = "0.15.1"
22

33

44
def get_sdk_version():

src/unstract/sdk/index.py

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
class ToolIndex:
2020
def __init__(self, tool: BaseTool):
21+
# TODO: Inherit from StreamMixin and avoid using BaseTool
2122
self.tool = tool
2223

2324
def get_text_from_index(
@@ -133,27 +134,6 @@ def index_file(
133134
if not file_hash:
134135
file_hash = ToolUtils.get_hash_from_file(file_path=file_path)
135136

136-
self.tool.stream_log("Extracting text from input file")
137-
full_text = []
138-
extracted_text = ""
139-
try:
140-
x2text = X2Text(tool=self.tool)
141-
x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
142-
adapter_instance_id=x2text_adapter
143-
)
144-
extracted_text = x2text_adapter_inst.process(
145-
input_file_path=file_path, output_file_path=output_file_path
146-
)
147-
except AdapterError as e:
148-
# Wrapping AdapterErrors with SdkError
149-
raise SdkError(str(e)) from e
150-
full_text.append(
151-
{
152-
"section": "full",
153-
"text_contents": self._cleanup_text(extracted_text),
154-
}
155-
)
156-
157137
doc_id = ToolIndex.generate_file_id(
158138
tool_id=tool_id,
159139
file_hash=file_hash,
@@ -224,6 +204,27 @@ def index_file(
224204
doc_id_not_found = True
225205

226206
if doc_id_not_found:
207+
self.tool.stream_log("Extracting text from input file")
208+
full_text = []
209+
extracted_text = ""
210+
try:
211+
x2text = X2Text(tool=self.tool)
212+
x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
213+
adapter_instance_id=x2text_adapter
214+
)
215+
extracted_text = x2text_adapter_inst.process(
216+
input_file_path=file_path, output_file_path=output_file_path
217+
)
218+
except AdapterError as e:
219+
# Wrapping AdapterErrors with SdkError
220+
raise SdkError(str(e)) from e
221+
full_text.append(
222+
{
223+
"section": "full",
224+
"text_contents": self._cleanup_text(extracted_text),
225+
}
226+
)
227+
227228
# Check if chunking is required
228229
documents = []
229230
for item in full_text:

0 commit comments

Comments
 (0)