Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions src/unstructured_client/_hooks/custom/logger_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,10 @@ def after_error(
if response and response.status_code == 200:
# NOTE: Even though this is an after_error method, due to split_pdf_hook logic we may get
# a success here when one of the split requests was partitioned successfully
logger.info("Successfully partitioned the document.")

else:
logger.error("Failed to partition the document.")
if response:
logger.error("Server responded with %d - %s", response.status_code, response.text)
if error is not None:
logger.error("Following error occurred - %s", error)

return response, error
logger.error("Failed to partition the document.")
if response:
logger.error("Server responded with %d - %s", response.status_code, response.text)
if error is not None:
logger.error("Following error occurred - %s", error)
return response, error
2 changes: 1 addition & 1 deletion src/unstructured_client/_hooks/custom/pdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def get_pdf_pages(
new_pdf.write(pdf_buffer)
pdf_buffer.seek(0)

yield pdf_buffer, offset, offset_end
yield pdf_buffer, offset
offset += split_size


Expand Down
38 changes: 1 addition & 37 deletions src/unstructured_client/_hooks/custom/split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,39 +222,32 @@ def before_request(
if split_pdf_page is None or split_pdf_page == "false":
return request

logger.info("Preparing to split document for partition.")
file = form_data.get(PARTITION_FORM_FILES_KEY)
if (
file is None
or not isinstance(file, shared.Files)
or not pdf_utils.is_pdf(file)
):
logger.info("Partitioning without split.")
return request

starting_page_number = form_utils.get_starting_page_number(
form_data,
key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
fallback_value=DEFAULT_STARTING_PAGE_NUMBER,
)
if starting_page_number > 1:
logger.info("Starting page number set to %d", starting_page_number)
logger.info("Starting page number set to %d", starting_page_number)

self.allow_failed = form_utils.get_split_pdf_allow_failed_param(
form_data,
key=PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
fallback_value=DEFAULT_ALLOW_FAILED,
)
logger.info("Allow failed set to %d", self.allow_failed)

concurrency_level = form_utils.get_split_pdf_concurrency_level_param(
form_data,
key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
fallback_value=DEFAULT_CONCURRENCY_LEVEL,
max_allowed=MAX_CONCURRENCY_LEVEL,
)
logger.info("Concurrency level set to %d", concurrency_level)
limiter = asyncio.Semaphore(concurrency_level)

content = cast(bytes, file.content)
Expand All @@ -267,40 +260,17 @@ def before_request(
)

page_count = page_range_end - page_range_start + 1
logger.info(
"Splitting pages %d to %d (%d total)",
page_range_start,
page_range_end,
page_count,
)

split_size = get_optimal_split_size(
num_pages=page_count, concurrency_level=concurrency_level
)
logger.info("Determined optimal split size of %d pages.", split_size)

# If the doc is small enough, and we aren't slicing it with a page range:
# do not split, just continue with the original request
if split_size >= page_count and page_count == len(pdf.pages):
logger.info(
"Document has too few pages (%d) to be split efficiently. Partitioning without split.",
page_count,
)
return request

pages = pdf_utils.get_pdf_pages(pdf, split_size=split_size, page_start=page_range_start, page_end=page_range_end)
logger.info(
"Partitioning %d files with %d page(s) each.",
math.floor(page_count / split_size),
split_size,
)

# Log the remainder pages if there are any
if page_count % split_size > 0:
logger.info(
"Partitioning 1 file with %d page(s).",
page_count % split_size,
)

# Use a variable to adjust the httpx client timeout, or default to 30 minutes
# When we're able to reuse the SDK to make these calls, we can remove this var
Expand All @@ -326,14 +296,8 @@ async def call_api_partial(page):

self.coroutines_to_execute[operation_id] = []
set_index = 1
for page_content, page_index, all_pages_number in pages:
for page_content, page_index in pages:
page_number = page_index + starting_page_number
logger.info(
"Partitioning set #%d (pages %d-%d).",
set_index,
page_number,
min(page_number + split_size - 1, all_pages_number),
)

coroutine = call_api_partial((page_content, page_number))
self.coroutines_to_execute[operation_id].append(coroutine)
Expand Down