Skip to content

Commit ea8ac30

Browse files
authored
Merge branch 'main' into python-upgrade
2 parents 7c78b08 + 499f8e0 commit ea8ac30

File tree

3 files changed

+8
-5
lines changed

3 files changed

+8
-5
lines changed

src/unstract/sdk/index.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,10 @@ def extract_text(
162162
usage_kwargs=usage_kwargs,
163163
)
164164
try:
165-
if enable_highlight and (isinstance(x2text.x2text_instance, LLMWhisperer) or isinstance(x2text.x2text_instance, LLMWhispererV2)):
165+
if enable_highlight and (
166+
isinstance(x2text.x2text_instance, LLMWhisperer)
167+
or isinstance(x2text.x2text_instance, LLMWhispererV2)
168+
):
166169
process_response: TextExtractionResult = x2text.process(
167170
input_file_path=file_path,
168171
output_file_path=output_file_path,
@@ -172,7 +175,7 @@ def extract_text(
172175
)
173176
whisper_hash_value = process_response.extraction_metadata.whisper_hash
174177
metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value}
175-
if hasattr(self.tool, 'update_exec_metadata'):
178+
if hasattr(self.tool, "update_exec_metadata"):
176179
self.tool.update_exec_metadata(metadata)
177180
else:
178181
process_response: TextExtractionResult = x2text.process(
@@ -202,6 +205,7 @@ def extract_text(
202205
return extracted_text
203206

204207
# TODO: Reduce the number of params by some dataclass
208+
# TODO: Deprecate and remove `process_text` argument
205209
@log_elapsed(operation="CHECK_AND_INDEX(overall)")
206210
@capture_metrics
207211
def index(

src/unstract/sdk/tool/executor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def execute(self, args: argparse.Namespace) -> None:
3636

3737
def _setup_for_run(self) -> None:
3838
"""Helps initialize tool execution for RUN command."""
39+
# TODO: Review if file storage equivalent is needed or remove this
3940
shutil.rmtree(self.tool.get_output_dir(), ignore_errors=True)
4041
Path(self.tool.get_output_dir()).mkdir(parents=True, exist_ok=True)
4142

src/unstract/sdk/x2txt.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,15 +123,12 @@ def push_usage_details(
123123
) -> None:
124124
file_size = ToolUtils.get_file_size(input_file_path, fs)
125125

126-
self._x2text_instance
127-
128126
if mime_type == MimeType.PDF:
129127
pdf_contents = io.BytesIO(fs.read(path=input_file_path, mode="rb"))
130128
with pdfplumber.open(pdf_contents) as pdf:
131129
# calculate the number of pages
132130
page_count = len(pdf.pages)
133131
if isinstance(self._x2text_instance, LLMWhisperer):
134-
self._x2text_instance.config.get(WhispererConfig.PAGES_TO_EXTRACT)
135132
page_count = ToolUtils.calculate_page_count(
136133
self._x2text_instance.config.get(WhispererConfig.PAGES_TO_EXTRACT),
137134
page_count,
@@ -144,6 +141,7 @@ def push_usage_details(
144141
kwargs=self._usage_kwargs,
145142
)
146143
else:
144+
# TODO: Calculate page usage for other file types (3000 words = 1 page)
147145
# We are allowing certain image types, and raw texts. We will consider them
148146
# as single page documents as there is no concept of page numbers.
149147
Audit().push_page_usage_data(

0 commit comments

Comments
 (0)