Skip to content

Commit 014f2fe

Browse files
committed
Merge with the last version
1 parent eb43642 commit 014f2fe

File tree

6 files changed

+29
-37
lines changed

6 files changed

+29
-37
lines changed

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,16 +1518,17 @@ def test_document_to_element_list_sets_category_depth_titles():
15181518
assert elements[2].metadata.category_depth is None
15191519
assert elements[3].metadata.category_depth == 0
15201520

1521+
15211522
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
15221523
@pytest.mark.parametrize(
15231524
("strategy", "origin"),
15241525
# fast: can't capture the "intentionally left blank page" page
15251526
# others: will ignore the actual blank page
15261527
[
1527-
(PartitionStrategy.FAST, {"pdfminer"}),
1528-
(PartitionStrategy.FAST, {"pdfminer"}),
1528+
(PartitionStrategy.FAST, {"pdfminer"}),
1529+
(PartitionStrategy.FAST, {"pdfminer"}),
15291530
(PartitionStrategy.HI_RES, {"yolox", "pdfminer", "ocr_tesseract"}),
1530-
(PartitionStrategy.OCR_ONLY, {"ocr_tesseract"}),
1531+
(PartitionStrategy.OCR_ONLY, {"ocr_tesseract"}),
15311532
],
15321533
)
15331534
def test_partition_pdf_with_password(
@@ -1540,28 +1541,21 @@ def test_partition_pdf_with_password(
15401541
def _test(result):
15411542
# validate that the result is a non-empty list of dicts
15421543
assert len(result) == 1
1543-
assert result[0].text == 'File with password'
1544+
assert result[0].text == "File with password"
15441545

15451546
if file_mode == "filename":
1546-
result = pdf.partition_pdf(
1547-
filename=filename, strategy=strategy,
1548-
password="password"
1549-
)
1547+
result = pdf.partition_pdf(filename=filename, strategy=strategy, password="password")
15501548
_test(result)
15511549
elif file_mode == "rb":
15521550
with open(filename, "rb") as f:
1553-
result = pdf.partition_pdf(
1554-
file=f, strategy=strategy,
1555-
password="password"
1556-
)
1551+
result = pdf.partition_pdf(file=f, strategy=strategy, password="password")
15571552
_test(result)
15581553
else:
15591554
with open(filename, "rb") as test_file:
15601555
spooled_temp_file = SpooledTemporaryFile()
15611556
spooled_temp_file.write(test_file.read())
15621557
spooled_temp_file.seek(0)
15631558
result = pdf.partition_pdf(
1564-
file=spooled_temp_file, strategy=strategy,
1565-
password="password"
1559+
file=spooled_temp_file, strategy=strategy, password="password"
15661560
)
15671561
_test(result)

unstructured/partition/image.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def partition_image(
3232
starting_page_number: int = 1,
3333
extract_forms: bool = False,
3434
form_extraction_skip_tables: bool = True,
35-
password:Optional[str]=None,
35+
password: Optional[str] = None,
3636
**kwargs: Any,
3737
) -> list[Element]:
3838
"""Parses an image into a list of interpreted elements.

unstructured/partition/pdf.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@
1212
import numpy as np
1313
import wrapt
1414
from pdfminer import psparser
15-
from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox
16-
from pdfminer.pdftypes import PDFObjRef
15+
from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
1716
from pdfminer.utils import open_filename
1817
from pi_heif import register_heif_opener
1918
from PIL import Image as PILImage
@@ -365,7 +364,7 @@ def extractable_elements(
365364
languages: Optional[list[str]] = None,
366365
metadata_last_modified: Optional[str] = None,
367366
starting_page_number: int = 1,
368-
password:Optional[str] = None,
367+
password: Optional[str] = None,
369368
**kwargs: Any,
370369
) -> list[list[Element]]:
371370
if isinstance(file, bytes):
@@ -387,7 +386,7 @@ def _partition_pdf_with_pdfminer(
387386
languages: list[str],
388387
metadata_last_modified: Optional[str],
389388
starting_page_number: int = 1,
390-
password:Optional[str] = None,
389+
password: Optional[str] = None,
391390
**kwargs: Any,
392391
) -> list[list[Element]]:
393392
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
@@ -446,7 +445,7 @@ def _process_pdfminer_pages(
446445

447446
for page_number, (page, page_layout) in enumerate(
448447
open_pdfminer_pages_generator(fp, password=password),
449-
start=starting_page_number,
448+
start=starting_page_number,
450449
):
451450
width, height = page_layout.width, page_layout.height
452451

@@ -568,7 +567,7 @@ def _partition_pdf_or_image_local(
568567
extract_forms: bool = False,
569568
form_extraction_skip_tables: bool = True,
570569
pdf_hi_res_max_pages: Optional[int] = None,
571-
password:Optional[str] = None,
570+
password: Optional[str] = None,
572571
**kwargs: Any,
573572
) -> list[Element]:
574573
"""Partition using package installed locally"""
@@ -609,8 +608,7 @@ def _partition_pdf_or_image_local(
609608
)
610609

611610
extracted_layout, layouts_links = (
612-
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi,
613-
password=password)
611+
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, password=password)
614612
if pdf_text_extractable
615613
else ([], [])
616614
)
@@ -879,8 +877,7 @@ def _partition_pdf_or_image_with_ocr(
879877
elements.extend(page_elements)
880878
else:
881879
for page_number, image in enumerate(
882-
convert_pdf_to_images(filename, file, password=password),
883-
start=starting_page_number
880+
convert_pdf_to_images(filename, file, password=password), start=starting_page_number
884881
):
885882
page_elements = _partition_pdf_or_image_with_ocr_from_image(
886883
image=image,
@@ -1177,11 +1174,11 @@ def document_to_element_list(
11771174
word = ""
11781175

11791176
if len(word) == 0:
1180-
start_index = text_len + index
1181-
x1 = character.x0
1182-
y2 = height - character.y0
1183-
x2 = character.x1
1184-
y1 = height - character.y1
1177+
text_len + index
1178+
character.x0
1179+
height - character.y0
1180+
character.x1
1181+
height - character.y1
11851182
else:
11861183

11871184
element.metadata.links = (

unstructured/partition/pdf_image/pdf_image_utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def convert_pdf_to_image(
5858
dpi: int = 200,
5959
output_folder: Optional[Union[str, PurePath]] = None,
6060
path_only: bool = False,
61-
password:Optional[str] = None,
61+
password: Optional[str] = None,
6262
) -> Union[List[Image.Image], List[str]]:
6363
"""Get the image renderings of the pdf pages using pdf2image"""
6464

@@ -127,7 +127,7 @@ def save_elements(
127127
is_image: bool = False,
128128
extract_image_block_to_payload: bool = False,
129129
output_dir_path: str | None = None,
130-
password:Optional[str] = None,
130+
password: Optional[str] = None,
131131
):
132132
"""
133133
Saves specific elements from a PDF as images either to a directory or embeds them in the
@@ -393,7 +393,7 @@ def convert_pdf_to_images(
393393
filename: str = "",
394394
file: Optional[bytes | IO[bytes]] = None,
395395
chunk_size: int = 10,
396-
password:Optional[str] = None,
396+
password: Optional[str] = None,
397397
) -> Iterator[Image.Image]:
398398
# Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on)
399399
exactly_one(filename=filename, file=file)

unstructured/partition/pdf_image/pdfminer_processing.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def process_file_with_pdfminer(
5151
def process_data_with_pdfminer(
5252
file: Optional[Union[bytes, BinaryIO]] = None,
5353
dpi: int = 200,
54-
password:Optional[str]=None,
54+
password: Optional[str] = None,
5555
) -> tuple[List[List["TextRegion"]], List[List]]:
5656
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
5757
pdf pages using pdf2image"""
@@ -65,8 +65,9 @@ def process_data_with_pdfminer(
6565
layouts_links = []
6666
# Coefficient to rescale bounding box to be compatible with images
6767
coef = dpi / 72
68-
for page_number, (page, page_layout) in (
69-
enumerate(open_pdfminer_pages_generator(file, password=password))):
68+
for page_number, (page, page_layout) in enumerate(
69+
open_pdfminer_pages_generator(file, password=password)
70+
):
7071
width, height = page_layout.width, page_layout.height
7172

7273
text_layout = []

unstructured/partition/pdf_image/pdfminer_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def rect_to_bbox(
7373
@requires_dependencies(["pikepdf", "pypdf"])
7474
def open_pdfminer_pages_generator(
7575
fp: BinaryIO,
76-
password:Optional[str]=None,
76+
password: Optional[str] = None,
7777
):
7878
"""Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
7979

0 commit comments

Comments
 (0)