diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d2d8950c4..416911c440 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.18.11-dev0 ### Enhancements +- **Switch from pdfminer.six to PAVÉS.** Increases the robustness of PDF extraction and uses multiple CPUs when possible. Removes the need to patch pdfminer or repair PDFs with pikepdf. ### Features diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index b0caffbb95..b4a11ab4a0 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -4,8 +4,8 @@ onnx>=1.17.0 onnxruntime>=1.19.0 pdf2image -pdfminer.six -pikepdf +paves +playa-pdf>=0.6.2 pi_heif pypdf google-cloud-vision diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 4d2ef23532..6518deb748 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -33,8 +33,6 @@ cryptography==45.0.5 # pdfminer-six cycler==0.12.1 # via matplotlib -deprecated==1.2.18 # via pikepdf effdet==0.4.1 # via -r ./extra-pdf-image.in filelock==3.18.0 @@ -88,10 +86,6 @@ jinja2==3.1.6 # via torch kiwisolver==1.4.8 # via matplotlib -lxml==6.0.0 # via # -c requirements/base.txt # pikepdf markupsafe==3.0.2 # via jinja2 matplotlib==3.10.3 @@ -134,30 +128,30 @@ packaging==25.0 # huggingface-hub # matplotlib # onnxruntime - # pikepdf # transformers # unstructured-pytesseract pandas==2.3.1 # via unstructured-inference +paves==0.6.1 + # via -r extra-pdf-image.in pdf2image==1.17.0 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in pdfminer-six==20250327 # via - # -c requirements/deps/constraints.txt - # -r ./extra-pdf-image.in + # -c ./deps/constraints.txt # unstructured-inference pi-heif==1.0.0 - # via -r ./extra-pdf-image.in -pikepdf==9.9.0 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in pillow==11.3.0 # via # matplotlib + # paves # pdf2image # pi-heif - # pikepdf # torchvision # unstructured-pytesseract +playa-pdf==0.6.2 + # via paves proto-plus==1.26.1 # via # google-api-core @@ -274,7 +268,6 @@ typing-extensions==4.14.1 # via # -c requirements/base.txt # huggingface-hub # onnx - # pypdf # torch tzdata==2025.2 # via pandas @@ -287,7 +280,3 @@ urllib3==2.5.0 # via # -c requirements/base.txt # -c requirements/deps/constraints.txt # requests -wrapt==1.17.2 # via # -c requirements/base.txt # deprecated diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 919ac89619..a50471de75 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1224,14 +1224,14 @@ def test_partition_pdf_with_fast_finds_headers_footers(): @pytest.mark.parametrize( ("filename", "expected_log"), [ - # This one is *actually* an invalid PDF document + # This one is *actually* an invalid PDF document, but we no longer need to repair it ("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."), ], ) def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog): caplog.set_level(logging.INFO) assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}")) - assert expected_log in caplog.text + assert expected_log not in caplog.text @pytest.mark.parametrize( diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index 309ea1336f..3f4a7c4f07 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ 
b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from pdfminer.layout import LAParams +from paves.miner import LAParams from PIL import Image from unstructured_inference.constants import Source as InferenceSource from unstructured_inference.inference.elements import ( diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py index 075a4e151e..2effe7eb75 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py @@ -1,6 +1,6 @@ from unittest.mock import MagicMock -from pdfminer.layout import LTContainer, LTTextLine +from paves.miner import LTContainer, LTTextLine from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index 94b12d5578..d2c0a1ce5f 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -125,8 +125,8 @@ def test_partition_msg_can_process_attachments(): assert all(e.metadata.filename == "fake-email-multiple-attachments.msg" for e in elements[:5]) assert all(e.metadata.filename == "unstructured_logo.png" for e in elements[5:7]) - assert all(e.metadata.filename == "dense_doc.pdf" for e in elements[7:343]) - assert all(e.metadata.filename == "Engineering Onboarding.pptx" for e in elements[343:]) + assert all(e.metadata.filename == "dense_doc.pdf" for e in elements[7:341]) + assert all(e.metadata.filename == "Engineering Onboarding.pptx" for e in elements[341:]) assert [e.text for e in elements[:5]] == [ "Here are those documents.", "--", diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html index c780303b30..81f092abee 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html @@ -58,12 +58,12 @@
In this paper, we address the question: can we train a better dense embedding model using only pairs of questions and passages (or answers), with- out additional pretraining? By leveraging the now standard BERT pretrained model (Devlin et al., 2019) and a dual-encoder architecture (Bromley et al., 1994), we focus on developing the right training scheme using a relatively small number of question and passage pairs. Through a series of careful ablation studies, our final solution is surprisingly simple: the embedding is optimized for maximizing inner products of the question and relevant passage vectors, with an objective compar- ing all pairs of questions and passages in a batch. Our Dense Passage Retriever (DPR) is exception- ally strong. It not only outperforms BM25 by a large margin (65.2% vs. 42.9% in Top-5 accuracy), but also results in a substantial improvement on the end-to-end QA accuracy compared to ORQA (41.5% vs. 33.3%) in the open Natural Questions setting (Lee et al., 2019; Kwiatkowski et al., 2019). Our contributions are twofold. First, we demon- strate that with the proper training setup, sim- ply fine-tuning the question and passage encoders on existing question-passage pairs is sufficient to greatly outperform BM25. Our empirical results also suggest that additional pretraining may not be needed. Second, we verify that, in the context of open-domain question answering, a higher retrieval precision indeed translates to a higher end-to-end QA accuracy. By applying a modern reader model to the top retrieved passages, we achieve compara- ble or better results on multiple QA datasets in the open-retrieval setting, compared to several, much complicated systems.
-
+
2 Background The problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume -the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. +the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. e
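Throughout this PR, paves.miner is used as a drop-in for pdfminer.six's layout API (LAParams, LTContainer, LTTextLine, and friends), with playa handling document access. A minimal sketch of the new single-process extraction path, using only the calls that appear in this diff; "example.pdf" is a placeholder, and the exact paves/playa behavior is assumed to match how this PR exercises it:

import playa
from paves.miner import LAParams, extract_page

# Open the document in page coordinate space, as this PR does everywhere.
with playa.open("example.pdf", space="page") as doc:  # "example.pdf" is hypothetical
    laparams = LAParams()  # default layout-analysis parameters
    for page in doc.pages:
        # extract_page(page, laparams) returns a pdfminer-style page layout
        # with width/height and iterable items (LTTextBox, LTImage, ...).
        page_layout = extract_page(page, laparams)
        print(page_layout.width, page_layout.height)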
diff --git a/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md b/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md index 8e74e6d944..c64ab3495e 100644 --- a/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md +++ b/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md @@ -43,7 +43,7 @@ In this paper, we address the question: can we train a better dense embedding mo The problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume -the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. +the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. 
For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. e 3 Dense Passage Retriever (DPR) diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json index 829b9b7a7e..40c36de858 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json @@ -111,8 +111,8 @@ }, { "type": "CompositeElement", - "element_id": "e6dee1abec28f8ff365ab6275b3e5f0e", - "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", + "element_id": "c2959a06eb5a6864c4f0c7d38e21b2e9", + "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. 
Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", "metadata": { "data_source": { "record_locator": { diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 0efe69ed03..6849b435af 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -10,12 +10,10 @@ from typing import IO, TYPE_CHECKING, Any, Optional, cast import numpy as np -import wrapt -from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox -from pdfminer.utils import open_filename +import playa +from paves.miner import LTContainer, LTImage, LTItem, LTTextBox, resolve1 from pi_heif import register_heif_opener from PIL import Image as PILImage -from pypdf import PdfReader from unstructured_inference.inference.layout import DocumentLayout from unstructured_inference.inference.layoutelement import LayoutElement @@ -93,19 +91,12 @@ PartitionStrategy, ) from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements -from unstructured.patches.pdfminer import patch_psparser from unstructured.utils import first, requires_dependencies if TYPE_CHECKING: pass -# Correct a bug that was introduced by a previous patch to -# pdfminer.six, causing needless and unsuccessful repairing of PDFs -# which were not actually broken. 
-patch_psparser() - - RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL) @@ -439,38 +430,23 @@ def _partition_pdf_with_pdfminer( """ exactly_one(filename=filename, file=file) - if filename: - with open_filename(filename, "rb") as fp: - fp = cast(IO[bytes], fp) - elements = _process_pdfminer_pages( - fp=fp, - filename=filename, - languages=languages, - metadata_last_modified=metadata_last_modified, - starting_page_number=starting_page_number, - password=password, - pdfminer_config=pdfminer_config, - **kwargs, - ) - - elif file: - elements = _process_pdfminer_pages( - fp=file, - filename=filename, - languages=languages, - metadata_last_modified=metadata_last_modified, - starting_page_number=starting_page_number, - password=password, - pdfminer_config=pdfminer_config, - **kwargs, - ) + elements = _process_pdfminer_pages( + fp=file, + filename=filename, + languages=languages, + metadata_last_modified=metadata_last_modified, + starting_page_number=starting_page_number, + password=password, + pdfminer_config=pdfminer_config, + **kwargs, + ) return elements -@requires_dependencies("pdfminer") +@requires_dependencies("paves") def _process_pdfminer_pages( - fp: IO[bytes], + fp: Optional[IO[bytes]], filename: str, metadata_last_modified: Optional[str], languages: Optional[list[str]] = None, @@ -485,7 +461,9 @@ def _process_pdfminer_pages( elements = [] for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(fp, password=password, pdfminer_config=pdfminer_config), + open_pdfminer_pages_generator( + fp, filename, password=password, pdfminer_config=pdfminer_config + ), start=starting_page_number, ): width, height = page_layout.width, page_layout.height @@ -497,8 +475,9 @@ def _process_pdfminer_pages( width=width, height=height, ) - if page.annots: - annotation_list = get_uris(page.annots, height, coordinate_system, page_number) + annots = resolve1(page.attrs.get("Annots")) + if annots: + annotation_list = get_uris(annots, height, coordinate_system, page_number) for obj in page_layout: x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) @@ -560,10 +539,11 @@ def _get_pdf_page_number( file: Optional[bytes | IO[bytes]] = None, ) -> int: if file: - number_of_pages = PdfReader(file).get_num_pages() + number_of_pages = len(playa.Document(file).pages) file.seek(0) elif filename: - number_of_pages = PdfReader(filename).get_num_pages() + with playa.open(filename) as pdf: + number_of_pages = len(pdf.pages) else: raise ValueError("Either 'file' or 'filename' must be provided.") return number_of_pages @@ -1030,19 +1010,6 @@ def _extract_text(item: LTItem) -> str: return "\n" -# Some pages with a ICC color space do not follow the pdf spec -# They throw an error when we call interpreter.process_page -# Since we don't need color info, we can just drop it in the pdfminer code -# See #2059 -@wrapt.patch_function_wrapper("pdfminer.pdfinterp", "PDFPageInterpreter.init_resources") -def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs): - resources = args[0] - if "ColorSpace" in resources: - del resources["ColorSpace"] - - return wrapped(resources) - - def _combine_list_elements( elements: list[Element], coordinate_system: PixelSpace | PointSpace ) -> list[Element]: diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 8941d5022b..7c56e1e83a 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ 
-1,12 +1,10 @@ from __future__ import annotations import os -from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast +from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union import numpy as np -from pdfminer.layout import LTChar, LTTextBox -from pdfminer.pdftypes import PDFObjRef -from pdfminer.utils import open_filename +from paves.miner import LTChar, LTTextBox, PDFObjRef, resolve1 from unstructured_inference.config import inference_config from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD from unstructured_inference.inference.elements import Rectangle @@ -43,12 +41,11 @@ def process_file_with_pdfminer( password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None, ) -> tuple[List[List["TextRegion"]], List[List]]: - with open_filename(filename, "rb") as fp: - fp = cast(BinaryIO, fp) - extracted_layout, layouts_links = process_data_with_pdfminer( - file=fp, dpi=dpi, password=password, pdfminer_config=pdfminer_config - ) - return extracted_layout, layouts_links + + extracted_layout, layouts_links = process_data_with_pdfminer( + file=None, filename=filename, dpi=dpi, password=password, pdfminer_config=pdfminer_config + ) + return extracted_layout, layouts_links def _validate_bbox(bbox: list[int | float]) -> bool: @@ -434,6 +431,7 @@ def process_page_layout_from_pdfminer( @requires_dependencies("unstructured_inference") def process_data_with_pdfminer( file: Optional[Union[bytes, BinaryIO]] = None, + filename: Optional[str] = None, dpi: int = 200, password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None, @@ -448,7 +446,9 @@ def process_data_with_pdfminer( # Coefficient to rescale bounding box to be compatible with images coef = dpi / 72 for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(file, password=password, pdfminer_config=pdfminer_config) + open_pdfminer_pages_generator( + file, filename, password=password, pdfminer_config=pdfminer_config + ) ): width, height = page_layout.width, page_layout.height @@ -457,8 +457,9 @@ def process_data_with_pdfminer( width=width, height=height, ) - if page.annots: - annotation_list = get_uris(page.annots, height, coordinate_system, page_number) + annots = resolve1(page.attrs.get("Annots")) + if annots: + annotation_list = get_uris(annots, height, coordinate_system, page_number) layout, urls_metadata = process_page_layout_from_pdfminer( annotation_list, page_layout, height, page_number, coef diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 3993f41ae0..eaee1a5baa 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -1,15 +1,10 @@ import os -import tempfile from typing import BinaryIO, List, Optional, Tuple -from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine -from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager -from pdfminer.pdfpage import PDFPage -from pdfminer.psexceptions import PSSyntaxError +import playa +from paves.miner import LAParams, LTContainer, LTImage, LTItem, LTTextLine, extract_page from pydantic import BaseModel -from unstructured.logger import logger from unstructured.utils import requires_dependencies @@ -20,18 +15,6 @@ class PDFMinerConfig(BaseModel): char_margin: Optional[float] = None -def init_pdfminer(pdfminer_config: Optional[PDFMinerConfig] = 
None): - rsrcmgr = PDFResourceManager() - - laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {} - laparams = LAParams(**laparams_kwargs) - - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - - return device, interpreter - - def extract_image_objects(parent_object: LTItem) -> List[LTImage]: """Recursively extracts image objects from a given parent object in a PDF document.""" objects = [] @@ -81,47 +64,29 @@ return (x1, y1, x2, y2) -@requires_dependencies(["pikepdf", "pypdf"]) +@requires_dependencies("paves") def open_pdfminer_pages_generator( - fp: BinaryIO, password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None + fp: Optional[BinaryIO] = None, + filename: Optional[str] = None, + password: Optional[str] = None, + pdfminer_config: Optional[PDFMinerConfig] = None, ): - """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" + """Open PDF pages with playa/paves, yielding (page, page_layout) pairs.""" - - import pikepdf - - from unstructured.partition.pdf_image.pypdf_utils import get_page_data - - device, interpreter = init_pdfminer(pdfminer_config=pdfminer_config) - with tempfile.TemporaryDirectory() as tmp_dir_path: - tmp_file_path = os.path.join(tmp_dir_path, "tmp_file") - try: - pages = PDFPage.get_pages(fp, password=password or "") - # Detect invalid dictionary construct for entire PDF - for i, page in enumerate(pages): - try: - # Detect invalid dictionary construct for one page - interpreter.process_page(page) - page_layout = device.get_result() - except PSSyntaxError: - logger.info("Detected invalid dictionary construct for PDFminer") - logger.info(f"Repairing the PDF page {i + 1} ...") - # find the error page from binary data fp - error_page_data = get_page_data(fp, page_number=i) - # repair the error page with pikepdf - with pikepdf.Pdf.open(error_page_data) as pdf: - pdf.save(tmp_file_path) - page = next(PDFPage.get_pages(open(tmp_file_path, "rb"))) # noqa: SIM115 - interpreter.process_page(page) - page_layout = device.get_result() - yield page, page_layout - except PSSyntaxError: - logger.info("Detected invalid dictionary construct for PDFminer") - logger.info("Repairing the PDF document ...") - # repair the entire doc with pikepdf - with pikepdf.Pdf.open(fp) as pdf: - pdf.save(tmp_file_path) - pages = PDFPage.get_pages(open(tmp_file_path, "rb")) # noqa: SIM115 - for page in pages: - interpreter.process_page(page) - page_layout = device.get_result() - yield page, page_layout + laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {} + laparams = LAParams(**laparams_kwargs) + if password is None: + password = "" # playa's default + + if fp is None: + from functools import partial + + assert filename + with playa.open( + filename, space="page", password=password, max_workers=max(1, (os.cpu_count() or 1) // 2) + ) as doc: + yield from zip(doc.pages, doc.pages.map(partial(extract_page, laparams=laparams))) + else: + doc = playa.Document(fp, space="page", password=password) + for page in doc.pages: + page_layout = extract_page(page, laparams) + yield page, page_layout diff --git a/unstructured/patches/pdfminer.py b/unstructured/patches/pdfminer.py deleted file mode 100644 index cc0c7dab21..0000000000 --- a/unstructured/patches/pdfminer.py +++ /dev/null @@ -1,76 +0,0 @@ -import functools -from typing import Tuple, Union - -import pdfminer -from pdfminer.psparser import ( - END_KEYWORD, - KWD, - PSEOF, - PSBaseParser, - PSBaseParserToken, - PSKeyword, - log, -) - 
-factory_seek = PSBaseParser.seek - - -@functools.wraps(PSBaseParser.seek) -def seek(self: PSBaseParser, pos: int) -> None: - factory_seek(self, pos) - self.eof = False - - -@functools.wraps(PSBaseParser._parse_keyword) -def _parse_keyword(self, s: bytes, i: int) -> int: - m = END_KEYWORD.search(s, i) - if m: - j = m.start(0) - self._curtoken += s[i:j] - else: - self._curtoken += s[i:] - return len(s) - if self._curtoken == b"true": - token: Union[bool, PSKeyword] = True - elif self._curtoken == b"false": - token = False - else: - token = KWD(self._curtoken) - self._add_token(token) - self._parse1 = self._parse_main - return j - - -@functools.wraps(PSBaseParser.nexttoken) -def nexttoken(self) -> Tuple[int, PSBaseParserToken]: - if self.eof: - # It's not really unexpected, come on now... - raise PSEOF("Unexpected EOF") - while not self._tokens: - try: - self.fillbuf() - self.charpos = self._parse1(self.buf, self.charpos) - except PSEOF: - # If we hit EOF in the middle of a token, try to parse - # it by tacking on whitespace, and delay raising PSEOF - # until next time around - self.charpos = self._parse1(b"\n", 0) - self.eof = True - # Oh, so there wasn't actually a token there? OK. - if not self._tokens: - raise - token = self._tokens.pop(0) - log.debug("nexttoken: %r", token) - return token - - -def patch_psparser(): - """Monkey-patch certain versions of pdfminer.six to avoid dropping - tokens at EOF (before 20231228) and splitting tokens at buffer - boundaries (20231228 and 20240706). - """ - # Presuming the bug will be fixed in the next release - if pdfminer.__version__ <= "20240706": - PSBaseParser.seek = seek - PSBaseParser._parse_keyword = _parse_keyword - PSBaseParser.nexttoken = nexttoken
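For the multi-CPU path that replaces the deleted repair logic above, a minimal sketch of the parallel page-extraction pattern used by the new open_pdfminer_pages_generator, again restricted to calls that appear in this diff; "example.pdf" is a placeholder, and the half-the-CPUs worker count is this PR's policy rather than a playa default:

import os
from functools import partial

import playa
from paves.miner import LAParams, extract_page

laparams = LAParams()
with playa.open(
    "example.pdf",  # hypothetical input path
    space="page",
    max_workers=max(1, (os.cpu_count() or 1) // 2),  # at least one worker, up to half the CPUs
) as doc:
    # doc.pages.map applies extract_page to each page, potentially in parallel
    # across the worker pool; zip pairs every page with its extracted layout.
    for page, page_layout in zip(
        doc.pages, doc.pages.map(partial(extract_page, laparams=laparams))
    ):
        print(page_layout.width, page_layout.height)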