diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7d2d8950c4..416911c440 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
## 0.18.11-dev0
### Enhancements
+- **Switch from pdfminer.six to PAVÉS**: Increases robustness of PDF extraction and uses multiple CPUs when possible. There is no longer any need to patch pdfminer or repair PDFs with pikepdf.
### Features
diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in
index b0caffbb95..b4a11ab4a0 100644
--- a/requirements/extra-pdf-image.in
+++ b/requirements/extra-pdf-image.in
@@ -4,8 +4,8 @@
onnx>=1.17.0
onnxruntime>=1.19.0
pdf2image
-pdfminer.six
-pikepdf
+paves
+playa-pdf>=0.6.2
pi_heif
pypdf
google-cloud-vision
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
index 4d2ef23532..6518deb748 100644
--- a/requirements/extra-pdf-image.txt
+++ b/requirements/extra-pdf-image.txt
@@ -33,8 +33,6 @@ cryptography==45.0.5
# pdfminer-six
cycler==0.12.1
# via matplotlib
-deprecated==1.2.18
- # via pikepdf
effdet==0.4.1
# via -r ./extra-pdf-image.in
filelock==3.18.0
@@ -88,10 +86,6 @@ jinja2==3.1.6
# via torch
kiwisolver==1.4.8
# via matplotlib
-lxml==6.0.0
- # via
- # -c requirements/base.txt
- # pikepdf
markupsafe==3.0.2
# via jinja2
matplotlib==3.10.3
@@ -134,30 +128,30 @@ packaging==25.0
# huggingface-hub
# matplotlib
# onnxruntime
- # pikepdf
# transformers
# unstructured-pytesseract
pandas==2.3.1
# via unstructured-inference
+paves==0.6.1
+ # via -r extra-pdf-image.in
pdf2image==1.17.0
- # via -r ./extra-pdf-image.in
+ # via -r extra-pdf-image.in
pdfminer-six==20250327
# via
- # -c requirements/deps/constraints.txt
- # -r ./extra-pdf-image.in
+ # -c ./deps/constraints.txt
# unstructured-inference
pi-heif==1.0.0
- # via -r ./extra-pdf-image.in
-pikepdf==9.9.0
- # via -r ./extra-pdf-image.in
+ # via -r extra-pdf-image.in
pillow==11.3.0
# via
# matplotlib
+ # paves
# pdf2image
# pi-heif
- # pikepdf
# torchvision
# unstructured-pytesseract
+playa-pdf==0.6.2
+ # via paves
proto-plus==1.26.1
# via
# google-api-core
@@ -274,7 +268,6 @@ typing-extensions==4.14.1
# -c requirements/base.txt
# huggingface-hub
# onnx
- # pypdf
# torch
tzdata==2025.2
# via pandas
@@ -287,7 +280,3 @@ urllib3==2.5.0
# -c requirements/base.txt
# -c requirements/deps/constraints.txt
# requests
-wrapt==1.17.2
- # via
- # -c requirements/base.txt
- # deprecated
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 919ac89619..a50471de75 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1224,14 +1224,14 @@ def test_partition_pdf_with_fast_finds_headers_footers():
@pytest.mark.parametrize(
("filename", "expected_log"),
[
- # This one is *actually* an invalid PDF document
+ # This one is *actually* an invalid PDF document, but we no longer need to repair it
("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
],
)
def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
caplog.set_level(logging.INFO)
assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
- assert expected_log in caplog.text
+ assert expected_log not in caplog.text
@pytest.mark.parametrize(
diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
index 309ea1336f..3f4a7c4f07 100644
--- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
+++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@@ -2,7 +2,7 @@
import numpy as np
import pytest
-from pdfminer.layout import LAParams
+from paves.miner import LAParams
from PIL import Image
from unstructured_inference.constants import Source as InferenceSource
from unstructured_inference.inference.elements import (
diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py
index 075a4e151e..2effe7eb75 100644
--- a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py
+++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py
@@ -1,6 +1,6 @@
from unittest.mock import MagicMock
-from pdfminer.layout import LTContainer, LTTextLine
+from paves.miner import LTContainer, LTTextLine
from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects
diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py
index 94b12d5578..d2c0a1ce5f 100644
--- a/test_unstructured/partition/test_msg.py
+++ b/test_unstructured/partition/test_msg.py
@@ -125,8 +125,8 @@ def test_partition_msg_can_process_attachments():
assert all(e.metadata.filename == "fake-email-multiple-attachments.msg" for e in elements[:5])
assert all(e.metadata.filename == "unstructured_logo.png" for e in elements[5:7])
- assert all(e.metadata.filename == "dense_doc.pdf" for e in elements[7:343])
- assert all(e.metadata.filename == "Engineering Onboarding.pptx" for e in elements[343:])
+ assert all(e.metadata.filename == "dense_doc.pdf" for e in elements[7:341])
+ assert all(e.metadata.filename == "Engineering Onboarding.pptx" for e in elements[341:])
assert [e.text for e in elements[:5]] == [
"Here are those documents.",
"--",
diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html
index c780303b30..81f092abee 100644
--- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html
+++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html
@@ -58,12 +58,12 @@
In this paper, we address the question: can we train a better dense embedding model using only pairs of questions and passages (or answers), with- out additional pretraining? By leveraging the now standard BERT pretrained model (Devlin et al., 2019) and a dual-encoder architecture (Bromley et al., 1994), we focus on developing the right training scheme using a relatively small number of question and passage pairs. Through a series of careful ablation studies, our final solution is surprisingly simple: the embedding is optimized for maximizing inner products of the question and relevant passage vectors, with an objective compar- ing all pairs of questions and passages in a batch. Our Dense Passage Retriever (DPR) is exception- ally strong. It not only outperforms BM25 by a large margin (65.2% vs. 42.9% in Top-5 accuracy), but also results in a substantial improvement on the end-to-end QA accuracy compared to ORQA (41.5% vs. 33.3%) in the open Natural Questions setting (Lee et al., 2019; Kwiatkowski et al., 2019). Our contributions are twofold. First, we demon- strate that with the proper training setup, sim- ply fine-tuning the question and passage encoders on existing question-passage pairs is sufficient to greatly outperform BM25. Our empirical results also suggest that additional pretraining may not be needed. Second, we verify that, in the context of open-domain question answering, a higher retrieval precision indeed translates to a higher end-to-end QA accuracy. By applying a modern reader model to the top retrieved passages, we achieve compara- ble or better results on multiple QA datasets in the open-retrieval setting, compared to several, much complicated systems.
-
+
2 Background
The problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume
-the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.
+the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.
e
diff --git a/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md b/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md
index 8e74e6d944..c64ab3495e 100644
--- a/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md
+++ b/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md
@@ -43,7 +43,7 @@ In this paper, we address the question: can we train a better dense embedding mo
The problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume
-the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.
+the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.
e
3 Dense Passage Retriever (DPR)
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json
index 829b9b7a7e..40c36de858 100644
--- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json
@@ -111,8 +111,8 @@
},
{
"type": "CompositeElement",
- "element_id": "e6dee1abec28f8ff365ab6275b3e5f0e",
- "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne",
+ "element_id": "c2959a06eb5a6864c4f0c7d38e21b2e9",
+ "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne",
"metadata": {
"data_source": {
"record_locator": {
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index 0efe69ed03..6849b435af 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -10,12 +10,10 @@
from typing import IO, TYPE_CHECKING, Any, Optional, cast
import numpy as np
-import wrapt
-from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
-from pdfminer.utils import open_filename
+import playa
+from paves.miner import LTContainer, LTImage, LTItem, LTTextBox, resolve1
from pi_heif import register_heif_opener
from PIL import Image as PILImage
-from pypdf import PdfReader
from unstructured_inference.inference.layout import DocumentLayout
from unstructured_inference.inference.layoutelement import LayoutElement
@@ -93,19 +91,12 @@
PartitionStrategy,
)
from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
-from unstructured.patches.pdfminer import patch_psparser
from unstructured.utils import first, requires_dependencies
if TYPE_CHECKING:
pass
-# Correct a bug that was introduced by a previous patch to
-# pdfminer.six, causing needless and unsuccessful repairing of PDFs
-# which were not actually broken.
-patch_psparser()
-
-
RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
@@ -439,38 +430,23 @@ def _partition_pdf_with_pdfminer(
"""
exactly_one(filename=filename, file=file)
- if filename:
- with open_filename(filename, "rb") as fp:
- fp = cast(IO[bytes], fp)
- elements = _process_pdfminer_pages(
- fp=fp,
- filename=filename,
- languages=languages,
- metadata_last_modified=metadata_last_modified,
- starting_page_number=starting_page_number,
- password=password,
- pdfminer_config=pdfminer_config,
- **kwargs,
- )
-
- elif file:
- elements = _process_pdfminer_pages(
- fp=file,
- filename=filename,
- languages=languages,
- metadata_last_modified=metadata_last_modified,
- starting_page_number=starting_page_number,
- password=password,
- pdfminer_config=pdfminer_config,
- **kwargs,
- )
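+    # Hand off to _process_pdfminer_pages, which now accepts either an open file object or a filename.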
+ elements = _process_pdfminer_pages(
+ fp=file,
+ filename=filename,
+ languages=languages,
+ metadata_last_modified=metadata_last_modified,
+ starting_page_number=starting_page_number,
+ password=password,
+ pdfminer_config=pdfminer_config,
+ **kwargs,
+ )
return elements
-@requires_dependencies("pdfminer")
+@requires_dependencies("paves")
def _process_pdfminer_pages(
- fp: IO[bytes],
+ fp: Optional[IO[bytes]],
filename: str,
metadata_last_modified: Optional[str],
languages: Optional[list[str]] = None,
@@ -485,7 +461,9 @@ def _process_pdfminer_pages(
elements = []
for page_number, (page, page_layout) in enumerate(
- open_pdfminer_pages_generator(fp, password=password, pdfminer_config=pdfminer_config),
+ open_pdfminer_pages_generator(
+ fp, filename, password=password, pdfminer_config=pdfminer_config
+ ),
start=starting_page_number,
):
width, height = page_layout.width, page_layout.height
@@ -497,8 +475,9 @@ def _process_pdfminer_pages(
width=width,
height=height,
)
- if page.annots:
- annotation_list = get_uris(page.annots, height, coordinate_system, page_number)
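+        # "Annots" may be an indirect object reference; resolve1 dereferences it before extracting URIs.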
+ annots = resolve1(page.attrs.get("Annots"))
+ if annots:
+ annotation_list = get_uris(annots, height, coordinate_system, page_number)
for obj in page_layout:
x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
@@ -560,10 +539,11 @@ def _get_pdf_page_number(
file: Optional[bytes | IO[bytes]] = None,
) -> int:
if file:
- number_of_pages = PdfReader(file).get_num_pages()
+ number_of_pages = len(playa.Document(file).pages)
file.seek(0)
elif filename:
- number_of_pages = PdfReader(filename).get_num_pages()
+ with playa.open(filename) as pdf:
+ number_of_pages = len(pdf.pages)
else:
raise ValueError("Either 'file' or 'filename' must be provided.")
return number_of_pages
@@ -1030,19 +1010,6 @@ def _extract_text(item: LTItem) -> str:
return "\n"
-# Some pages with a ICC color space do not follow the pdf spec
-# They throw an error when we call interpreter.process_page
-# Since we don't need color info, we can just drop it in the pdfminer code
-# See #2059
-@wrapt.patch_function_wrapper("pdfminer.pdfinterp", "PDFPageInterpreter.init_resources")
-def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):
- resources = args[0]
- if "ColorSpace" in resources:
- del resources["ColorSpace"]
-
- return wrapped(resources)
-
-
def _combine_list_elements(
elements: list[Element], coordinate_system: PixelSpace | PointSpace
) -> list[Element]:
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
index 8941d5022b..7c56e1e83a 100644
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -1,12 +1,10 @@
from __future__ import annotations
import os
-from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union
import numpy as np
-from pdfminer.layout import LTChar, LTTextBox
-from pdfminer.pdftypes import PDFObjRef
-from pdfminer.utils import open_filename
+from paves.miner import LTChar, LTTextBox, PDFObjRef, resolve1
from unstructured_inference.config import inference_config
from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD
from unstructured_inference.inference.elements import Rectangle
@@ -43,12 +41,11 @@ def process_file_with_pdfminer(
password: Optional[str] = None,
pdfminer_config: Optional[PDFMinerConfig] = None,
) -> tuple[List[List["TextRegion"]], List[List]]:
- with open_filename(filename, "rb") as fp:
- fp = cast(BinaryIO, fp)
- extracted_layout, layouts_links = process_data_with_pdfminer(
- file=fp, dpi=dpi, password=password, pdfminer_config=pdfminer_config
- )
- return extracted_layout, layouts_links
+
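+    # Pass the filename through so the PLAYA-backed generator opens the file itself (enabling parallel extraction).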
+ extracted_layout, layouts_links = process_data_with_pdfminer(
+ file=None, filename=filename, dpi=dpi, password=password, pdfminer_config=pdfminer_config
+ )
+ return extracted_layout, layouts_links
def _validate_bbox(bbox: list[int | float]) -> bool:
@@ -434,6 +431,7 @@ def process_page_layout_from_pdfminer(
@requires_dependencies("unstructured_inference")
def process_data_with_pdfminer(
file: Optional[Union[bytes, BinaryIO]] = None,
+ filename: Optional[str] = None,
dpi: int = 200,
password: Optional[str] = None,
pdfminer_config: Optional[PDFMinerConfig] = None,
@@ -448,7 +446,9 @@ def process_data_with_pdfminer(
# Coefficient to rescale bounding box to be compatible with images
coef = dpi / 72
for page_number, (page, page_layout) in enumerate(
- open_pdfminer_pages_generator(file, password=password, pdfminer_config=pdfminer_config)
+ open_pdfminer_pages_generator(
+ file, filename, password=password, pdfminer_config=pdfminer_config
+ )
):
width, height = page_layout.width, page_layout.height
@@ -457,8 +457,9 @@ def process_data_with_pdfminer(
width=width,
height=height,
)
- if page.annots:
- annotation_list = get_uris(page.annots, height, coordinate_system, page_number)
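+        # As in pdf.py: "Annots" may be an indirect reference, so dereference it with resolve1 first.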
+ annots = resolve1(page.attrs.get("Annots"))
+ if annots:
+ annotation_list = get_uris(annots, height, coordinate_system, page_number)
layout, urls_metadata = process_page_layout_from_pdfminer(
annotation_list, page_layout, height, page_number, coef
diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
index 3993f41ae0..eaee1a5baa 100644
--- a/unstructured/partition/pdf_image/pdfminer_utils.py
+++ b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -1,15 +1,10 @@
import os
-import tempfile
from typing import BinaryIO, List, Optional, Tuple
-from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine
-from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
-from pdfminer.pdfpage import PDFPage
-from pdfminer.psexceptions import PSSyntaxError
+import playa
+from paves.miner import LAParams, LTContainer, LTImage, LTItem, LTTextLine, extract_page
from pydantic import BaseModel
-from unstructured.logger import logger
from unstructured.utils import requires_dependencies
@@ -20,18 +15,6 @@ class PDFMinerConfig(BaseModel):
char_margin: Optional[float] = None
-def init_pdfminer(pdfminer_config: Optional[PDFMinerConfig] = None):
- rsrcmgr = PDFResourceManager()
-
- laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {}
- laparams = LAParams(**laparams_kwargs)
-
- device = PDFPageAggregator(rsrcmgr, laparams=laparams)
- interpreter = PDFPageInterpreter(rsrcmgr, device)
-
- return device, interpreter
-
-
def extract_image_objects(parent_object: LTItem) -> List[LTImage]:
"""Recursively extracts image objects from a given parent object in a PDF document."""
objects = []
@@ -81,47 +64,29 @@ def rect_to_bbox(
return (x1, y1, x2, y2)
-@requires_dependencies(["pikepdf", "pypdf"])
+@requires_dependencies("paves")
def open_pdfminer_pages_generator(
- fp: BinaryIO, password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None
+ fp: Optional[BinaryIO] = None,
+ filename: Optional[str] = None,
+ password: Optional[str] = None,
+ pdfminer_config: Optional[PDFMinerConfig] = None,
):
"""Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
-
- import pikepdf
-
- from unstructured.partition.pdf_image.pypdf_utils import get_page_data
-
- device, interpreter = init_pdfminer(pdfminer_config=pdfminer_config)
- with tempfile.TemporaryDirectory() as tmp_dir_path:
- tmp_file_path = os.path.join(tmp_dir_path, "tmp_file")
- try:
- pages = PDFPage.get_pages(fp, password=password or "")
- # Detect invalid dictionary construct for entire PDF
- for i, page in enumerate(pages):
- try:
- # Detect invalid dictionary construct for one page
- interpreter.process_page(page)
- page_layout = device.get_result()
- except PSSyntaxError:
- logger.info("Detected invalid dictionary construct for PDFminer")
- logger.info(f"Repairing the PDF page {i + 1} ...")
- # find the error page from binary data fp
- error_page_data = get_page_data(fp, page_number=i)
- # repair the error page with pikepdf
- with pikepdf.Pdf.open(error_page_data) as pdf:
- pdf.save(tmp_file_path)
- page = next(PDFPage.get_pages(open(tmp_file_path, "rb"))) # noqa: SIM115
- interpreter.process_page(page)
- page_layout = device.get_result()
- yield page, page_layout
- except PSSyntaxError:
- logger.info("Detected invalid dictionary construct for PDFminer")
- logger.info("Repairing the PDF document ...")
- # repair the entire doc with pikepdf
- with pikepdf.Pdf.open(fp) as pdf:
- pdf.save(tmp_file_path)
- pages = PDFPage.get_pages(open(tmp_file_path, "rb")) # noqa: SIM115
- for page in pages:
- interpreter.process_page(page)
- page_layout = device.get_result()
- yield page, page_layout
+ laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {}
+ laparams = LAParams(**laparams_kwargs)
+ if password is None:
+ password = "" # playa's default
+
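+    # Given a filename, PLAYA can map page extraction across worker processes; a file object is handled sequentially below.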
+ if fp is None:
+ from functools import partial
+
+ assert filename
+ with playa.open(
+ filename, space="page", password=password, max_workers=min(1, os.cpu_count() // 2)
+ ) as doc:
+ yield from zip(doc.pages, doc.pages.map(partial(extract_page, laparams=laparams)))
+ else:
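+        # Sequential path: parse the stream in-process and lay out each page in turn.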
+ doc = playa.Document(fp, space="page", password=password)
+ for page in doc.pages:
+ page_layout = extract_page(page, laparams)
+ yield page, page_layout
diff --git a/unstructured/patches/pdfminer.py b/unstructured/patches/pdfminer.py
deleted file mode 100644
index cc0c7dab21..0000000000
--- a/unstructured/patches/pdfminer.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import functools
-from typing import Tuple, Union
-
-import pdfminer
-from pdfminer.psparser import (
- END_KEYWORD,
- KWD,
- PSEOF,
- PSBaseParser,
- PSBaseParserToken,
- PSKeyword,
- log,
-)
-
-factory_seek = PSBaseParser.seek
-
-
-@functools.wraps(PSBaseParser.seek)
-def seek(self: PSBaseParser, pos: int) -> None:
- factory_seek(self, pos)
- self.eof = False
-
-
-@functools.wraps(PSBaseParser._parse_keyword)
-def _parse_keyword(self, s: bytes, i: int) -> int:
- m = END_KEYWORD.search(s, i)
- if m:
- j = m.start(0)
- self._curtoken += s[i:j]
- else:
- self._curtoken += s[i:]
- return len(s)
- if self._curtoken == b"true":
- token: Union[bool, PSKeyword] = True
- elif self._curtoken == b"false":
- token = False
- else:
- token = KWD(self._curtoken)
- self._add_token(token)
- self._parse1 = self._parse_main
- return j
-
-
-@functools.wraps(PSBaseParser.nexttoken)
-def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
- if self.eof:
- # It's not really unexpected, come on now...
- raise PSEOF("Unexpected EOF")
- while not self._tokens:
- try:
- self.fillbuf()
- self.charpos = self._parse1(self.buf, self.charpos)
- except PSEOF:
- # If we hit EOF in the middle of a token, try to parse
- # it by tacking on whitespace, and delay raising PSEOF
- # until next time around
- self.charpos = self._parse1(b"\n", 0)
- self.eof = True
- # Oh, so there wasn't actually a token there? OK.
- if not self._tokens:
- raise
- token = self._tokens.pop(0)
- log.debug("nexttoken: %r", token)
- return token
-
-
-def patch_psparser():
- """Monkey-patch certain versions of pdfminer.six to avoid dropping
- tokens at EOF (before 20231228) and splitting tokens at buffer
- boundaries (20231228 and 20240706).
- """
- # Presuming the bug will be fixed in the next release
- if pdfminer.__version__ <= "20240706":
- PSBaseParser.seek = seek
- PSBaseParser._parse_keyword = _parse_keyword
- PSBaseParser.nexttoken = nexttoken