diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d2d8950c4..416911c440 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.18.11-dev0 ### Enhancements +- **Switch from pdfminer.six to PAVÉS.** Increases the robustness of PDF extraction and uses multiple CPUs when possible. Removes the need to patch pdfminer or repair PDFs with pikepdf. ### Features diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index b0caffbb95..b4a11ab4a0 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -4,8 +4,8 @@ onnx>=1.17.0 onnxruntime>=1.19.0 pdf2image -pdfminer.six -pikepdf +paves +playa-pdf>=0.6.2 pi_heif pypdf google-cloud-vision diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 4d2ef23532..6518deb748 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -33,8 +33,6 @@ cryptography==45.0.5 # pdfminer-six cycler==0.12.1 # via matplotlib -deprecated==1.2.18 # via pikepdf effdet==0.4.1 # via -r ./extra-pdf-image.in filelock==3.18.0 @@ -88,10 +86,6 @@ jinja2==3.1.6 # via torch kiwisolver==1.4.8 # via matplotlib -lxml==6.0.0 # via # -c requirements/base.txt # pikepdf markupsafe==3.0.2 # via jinja2 matplotlib==3.10.3 @@ -134,30 +128,30 @@ packaging==25.0 # huggingface-hub # matplotlib # onnxruntime - # pikepdf # transformers # unstructured-pytesseract pandas==2.3.1 # via unstructured-inference +paves==0.6.1 + # via -r extra-pdf-image.in pdf2image==1.17.0 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in pdfminer-six==20250327 # via - # -c requirements/deps/constraints.txt - # -r ./extra-pdf-image.in + # -c ./deps/constraints.txt # unstructured-inference pi-heif==1.0.0 - # via -r ./extra-pdf-image.in -pikepdf==9.9.0 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in pillow==11.3.0 # via # matplotlib + # paves # pdf2image # pi-heif - # pikepdf # torchvision # unstructured-pytesseract +playa-pdf==0.6.2 + # via paves proto-plus==1.26.1 # via # google-api-core @@ -274,7 +268,6 @@ typing-extensions==4.14.1 # via # -c requirements/base.txt # huggingface-hub # onnx - # pypdf # torch tzdata==2025.2 # via pandas @@ -287,7 +280,3 @@ urllib3==2.5.0 # via # -c requirements/base.txt # -c requirements/deps/constraints.txt # requests -wrapt==1.17.2 # via # -c requirements/base.txt # deprecated diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 919ac89619..a50471de75 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1224,14 +1224,14 @@ def test_partition_pdf_with_fast_finds_headers_footers(): @pytest.mark.parametrize( ("filename", "expected_log"), [ - # This one is *actually* an invalid PDF document + # This one is *actually* an invalid PDF document, but we no longer need to repair it ("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."), ], ) def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog): caplog.set_level(logging.INFO) assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}")) - assert expected_log in caplog.text + assert expected_log not in caplog.text @pytest.mark.parametrize( diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index 309ea1336f..3f4a7c4f07 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ 
b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from pdfminer.layout import LAParams +from paves.miner import LAParams from PIL import Image from unstructured_inference.constants import Source as InferenceSource from unstructured_inference.inference.elements import ( diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py index 075a4e151e..2effe7eb75 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py @@ -1,6 +1,6 @@ from unittest.mock import MagicMock -from pdfminer.layout import LTContainer, LTTextLine +from paves.miner import LTContainer, LTTextLine from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index 94b12d5578..d2c0a1ce5f 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -125,8 +125,8 @@ def test_partition_msg_can_process_attachments(): assert all(e.metadata.filename == "fake-email-multiple-attachments.msg" for e in elements[:5]) assert all(e.metadata.filename == "unstructured_logo.png" for e in elements[5:7]) - assert all(e.metadata.filename == "dense_doc.pdf" for e in elements[7:343]) - assert all(e.metadata.filename == "Engineering Onboarding.pptx" for e in elements[343:]) + assert all(e.metadata.filename == "dense_doc.pdf" for e in elements[7:341]) + assert all(e.metadata.filename == "Engineering Onboarding.pptx" for e in elements[341:]) assert [e.text for e in elements[:5]] == [ "Here are those documents.", "--", diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html index c780303b30..81f092abee 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html @@ -58,12 +58,12 @@
In this paper, we address the question: can we train a better dense embedding model using only pairs of questions and passages (or answers), with- out additional pretraining? By leveraging the now standard BERT pretrained model (Devlin et al., 2019) and a dual-encoder architecture (Bromley et al., 1994), we focus on developing the right training scheme using a relatively small number of question and passage pairs. Through a series of careful ablation studies, our final solution is surprisingly simple: the embedding is optimized for maximizing inner products of the question and relevant passage vectors, with an objective compar- ing all pairs of questions and passages in a batch. Our Dense Passage Retriever (DPR) is exception- ally strong. It not only outperforms BM25 by a large margin (65.2% vs. 42.9% in Top-5 accuracy), but also results in a substantial improvement on the end-to-end QA accuracy compared to ORQA (41.5% vs. 33.3%) in the open Natural Questions setting (Lee et al., 2019; Kwiatkowski et al., 2019). Our contributions are twofold. First, we demon- strate that with the proper training setup, sim- ply fine-tuning the question and passage encoders on existing question-passage pairs is sufficient to greatly outperform BM25. Our empirical results also suggest that additional pretraining may not be needed. Second, we verify that, in the context of open-domain question answering, a higher retrieval precision indeed translates to a higher end-to-end QA accuracy. By applying a modern reader model to the top retrieved passages, we achieve compara- ble or better results on multiple QA datasets in the open-retrieval setting, compared to several, much complicated systems.
-
+
2 Background The problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume -the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. +the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. e
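Throughout this PR, paves.miner is used as a drop-in for pdfminer.six's layout API (LAParams, LTContainer, LTTextLine, and friends), with playa handling document access. A minimal sketch of the new single-process extraction path, using only the calls that appear in this diff; "example.pdf" is a placeholder, and the exact paves/playa behavior is assumed to match how this PR exercises it:

import playa
from paves.miner import LAParams, extract_page

# Open the document in page coordinate space, as this PR does everywhere.
with playa.open("example.pdf", space="page") as doc:  # "example.pdf" is hypothetical
    laparams = LAParams()  # default layout-analysis parameters
    for page in doc.pages:
        # extract_page(page, laparams) returns a pdfminer-style page layout
        # with width/height and iterable items (LTTextBox, LTImage, ...).
        page_layout = extract_page(page, laparams)
        print(page_layout.width, page_layout.height)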
diff --git a/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md b/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md index 8e74e6d944..c64ab3495e 100644 --- a/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md +++ b/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md @@ -43,7 +43,7 @@ In this paper, we address the question: can we train a better dense embedding mo The problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume -the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. +the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. 
For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. e 3 Dense Passage Retriever (DPR) diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json index 829b9b7a7e..40c36de858 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json @@ -111,8 +111,8 @@ }, { "type": "CompositeElement", - "element_id": "e6dee1abec28f8ff365ab6275b3e5f0e", - "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", + "element_id": "c2959a06eb5a6864c4f0c7d38e21b2e9", + "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. 
Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", "metadata": { "data_source": { "record_locator": { diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 0efe69ed03..6849b435af 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -10,12 +10,10 @@ from typing import IO, TYPE_CHECKING, Any, Optional, cast import numpy as np -import wrapt -from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox -from pdfminer.utils import open_filename +import playa +from paves.miner import LTContainer, LTImage, LTItem, LTTextBox, resolve1 from pi_heif import register_heif_opener from PIL import Image as PILImage -from pypdf import PdfReader from unstructured_inference.inference.layout import DocumentLayout from unstructured_inference.inference.layoutelement import LayoutElement @@ -93,19 +91,12 @@ PartitionStrategy, ) from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements -from unstructured.patches.pdfminer import patch_psparser from unstructured.utils import first, requires_dependencies if TYPE_CHECKING: pass -# Correct a bug that was introduced by a previous patch to -# pdfminer.six, causing needless and unsuccessful repairing of PDFs -# which were not actually broken. 
-patch_psparser() - - RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL) @@ -439,38 +430,23 @@ def _partition_pdf_with_pdfminer( """ exactly_one(filename=filename, file=file) - if filename: - with open_filename(filename, "rb") as fp: - fp = cast(IO[bytes], fp) - elements = _process_pdfminer_pages( - fp=fp, - filename=filename, - languages=languages, - metadata_last_modified=metadata_last_modified, - starting_page_number=starting_page_number, - password=password, - pdfminer_config=pdfminer_config, - **kwargs, - ) - - elif file: - elements = _process_pdfminer_pages( - fp=file, - filename=filename, - languages=languages, - metadata_last_modified=metadata_last_modified, - starting_page_number=starting_page_number, - password=password, - pdfminer_config=pdfminer_config, - **kwargs, - ) + elements = _process_pdfminer_pages( + fp=file, + filename=filename, + languages=languages, + metadata_last_modified=metadata_last_modified, + starting_page_number=starting_page_number, + password=password, + pdfminer_config=pdfminer_config, + **kwargs, + ) return elements -@requires_dependencies("pdfminer") +@requires_dependencies("paves") def _process_pdfminer_pages( - fp: IO[bytes], + fp: Optional[IO[bytes]], filename: str, metadata_last_modified: Optional[str], languages: Optional[list[str]] = None, @@ -485,7 +461,9 @@ def _process_pdfminer_pages( elements = [] for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(fp, password=password, pdfminer_config=pdfminer_config), + open_pdfminer_pages_generator( + fp, filename, password=password, pdfminer_config=pdfminer_config + ), start=starting_page_number, ): width, height = page_layout.width, page_layout.height @@ -497,8 +475,9 @@ def _process_pdfminer_pages( width=width, height=height, ) - if page.annots: - annotation_list = get_uris(page.annots, height, coordinate_system, page_number) + annots = resolve1(page.attrs.get("Annots")) + if annots: + annotation_list = get_uris(annots, height, coordinate_system, page_number) for obj in page_layout: x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) @@ -560,10 +539,11 @@ def _get_pdf_page_number( file: Optional[bytes | IO[bytes]] = None, ) -> int: if file: - number_of_pages = PdfReader(file).get_num_pages() + number_of_pages = len(playa.Document(file).pages) file.seek(0) elif filename: - number_of_pages = PdfReader(filename).get_num_pages() + with playa.open(filename) as pdf: + number_of_pages = len(pdf.pages) else: raise ValueError("Either 'file' or 'filename' must be provided.") return number_of_pages @@ -1030,19 +1010,6 @@ def _extract_text(item: LTItem) -> str: return "\n" -# Some pages with a ICC color space do not follow the pdf spec -# They throw an error when we call interpreter.process_page -# Since we don't need color info, we can just drop it in the pdfminer code -# See #2059 -@wrapt.patch_function_wrapper("pdfminer.pdfinterp", "PDFPageInterpreter.init_resources") -def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs): - resources = args[0] - if "ColorSpace" in resources: - del resources["ColorSpace"] - - return wrapped(resources) - - def _combine_list_elements( elements: list[Element], coordinate_system: PixelSpace | PointSpace ) -> list[Element]: diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 8941d5022b..7c56e1e83a 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ 
-1,12 +1,10 @@ from __future__ import annotations import os -from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast +from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union import numpy as np -from pdfminer.layout import LTChar, LTTextBox -from pdfminer.pdftypes import PDFObjRef -from pdfminer.utils import open_filename +from paves.miner import LTChar, LTTextBox, PDFObjRef, resolve1 from unstructured_inference.config import inference_config from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD from unstructured_inference.inference.elements import Rectangle @@ -43,12 +41,11 @@ def process_file_with_pdfminer( password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None, ) -> tuple[List[List["TextRegion"]], List[List]]: - with open_filename(filename, "rb") as fp: - fp = cast(BinaryIO, fp) - extracted_layout, layouts_links = process_data_with_pdfminer( - file=fp, dpi=dpi, password=password, pdfminer_config=pdfminer_config - ) - return extracted_layout, layouts_links + + extracted_layout, layouts_links = process_data_with_pdfminer( + file=None, filename=filename, dpi=dpi, password=password, pdfminer_config=pdfminer_config + ) + return extracted_layout, layouts_links def _validate_bbox(bbox: list[int | float]) -> bool: @@ -434,6 +431,7 @@ def process_page_layout_from_pdfminer( @requires_dependencies("unstructured_inference") def process_data_with_pdfminer( file: Optional[Union[bytes, BinaryIO]] = None, + filename: Optional[str] = None, dpi: int = 200, password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None, @@ -448,7 +446,9 @@ def process_data_with_pdfminer( # Coefficient to rescale bounding box to be compatible with images coef = dpi / 72 for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(file, password=password, pdfminer_config=pdfminer_config) + open_pdfminer_pages_generator( + file, filename, password=password, pdfminer_config=pdfminer_config + ) ): width, height = page_layout.width, page_layout.height @@ -457,8 +457,9 @@ def process_data_with_pdfminer( width=width, height=height, ) - if page.annots: - annotation_list = get_uris(page.annots, height, coordinate_system, page_number) + annots = resolve1(page.attrs.get("Annots")) + if annots: + annotation_list = get_uris(annots, height, coordinate_system, page_number) layout, urls_metadata = process_page_layout_from_pdfminer( annotation_list, page_layout, height, page_number, coef diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 3993f41ae0..eaee1a5baa 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -1,15 +1,10 @@ import os -import tempfile from typing import BinaryIO, List, Optional, Tuple -from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine -from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager -from pdfminer.pdfpage import PDFPage -from pdfminer.psexceptions import PSSyntaxError +import playa +from paves.miner import LAParams, LTContainer, LTImage, LTItem, LTTextLine, extract_page from pydantic import BaseModel -from unstructured.logger import logger from unstructured.utils import requires_dependencies @@ -20,18 +15,6 @@ class PDFMinerConfig(BaseModel): char_margin: Optional[float] = None -def init_pdfminer(pdfminer_config: Optional[PDFMinerConfig] = 
None): - rsrcmgr = PDFResourceManager() - - laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {} - laparams = LAParams(**laparams_kwargs) - - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - - return device, interpreter - - def extract_image_objects(parent_object: LTItem) -> List[LTImage]: """Recursively extracts image objects from a given parent object in a PDF document.""" objects = [] @@ -81,47 +64,29 @@ return (x1, y1, x2, y2) -@requires_dependencies(["pikepdf", "pypdf"]) +@requires_dependencies("paves") def open_pdfminer_pages_generator( - fp: BinaryIO, password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None + fp: Optional[BinaryIO] = None, + filename: Optional[str] = None, + password: Optional[str] = None, + pdfminer_config: Optional[PDFMinerConfig] = None, ): - """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" + """Open PDF pages with playa/paves, yielding (page, page_layout) pairs.""" - - import pikepdf - - from unstructured.partition.pdf_image.pypdf_utils import get_page_data - - device, interpreter = init_pdfminer(pdfminer_config=pdfminer_config) - with tempfile.TemporaryDirectory() as tmp_dir_path: - tmp_file_path = os.path.join(tmp_dir_path, "tmp_file") - try: - pages = PDFPage.get_pages(fp, password=password or "") - # Detect invalid dictionary construct for entire PDF - for i, page in enumerate(pages): - try: - # Detect invalid dictionary construct for one page - interpreter.process_page(page) - page_layout = device.get_result() - except PSSyntaxError: - logger.info("Detected invalid dictionary construct for PDFminer") - logger.info(f"Repairing the PDF page {i + 1} ...") - # find the error page from binary data fp - error_page_data = get_page_data(fp, page_number=i) - # repair the error page with pikepdf - with pikepdf.Pdf.open(error_page_data) as pdf: - pdf.save(tmp_file_path) - page = next(PDFPage.get_pages(open(tmp_file_path, "rb"))) # noqa: SIM115 - interpreter.process_page(page) - page_layout = device.get_result() - yield page, page_layout - except PSSyntaxError: - logger.info("Detected invalid dictionary construct for PDFminer") - logger.info("Repairing the PDF document ...") - # repair the entire doc with pikepdf - with pikepdf.Pdf.open(fp) as pdf: - pdf.save(tmp_file_path) - pages = PDFPage.get_pages(open(tmp_file_path, "rb")) # noqa: SIM115 - for page in pages: - interpreter.process_page(page) - page_layout = device.get_result() - yield page, page_layout + laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {} + laparams = LAParams(**laparams_kwargs) + if password is None: + password = "" # playa's default + + if fp is None: + from functools import partial + + assert filename + with playa.open( + filename, space="page", password=password, max_workers=max(1, (os.cpu_count() or 1) // 2) + ) as doc: + yield from zip(doc.pages, doc.pages.map(partial(extract_page, laparams=laparams))) + else: + doc = playa.Document(fp, space="page", password=password) + for page in doc.pages: + page_layout = extract_page(page, laparams) + yield page, page_layout diff --git a/unstructured/patches/pdfminer.py b/unstructured/patches/pdfminer.py deleted file mode 100644 index cc0c7dab21..0000000000 --- a/unstructured/patches/pdfminer.py +++ /dev/null @@ -1,76 +0,0 @@ -import functools -from typing import Tuple, Union - -import pdfminer -from pdfminer.psparser import ( - END_KEYWORD, - KWD, - PSEOF, - PSBaseParser, - PSBaseParserToken, - PSKeyword, - log, -) - 
-factory_seek = PSBaseParser.seek - - -@functools.wraps(PSBaseParser.seek) -def seek(self: PSBaseParser, pos: int) -> None: - factory_seek(self, pos) - self.eof = False - - -@functools.wraps(PSBaseParser._parse_keyword) -def _parse_keyword(self, s: bytes, i: int) -> int: - m = END_KEYWORD.search(s, i) - if m: - j = m.start(0) - self._curtoken += s[i:j] - else: - self._curtoken += s[i:] - return len(s) - if self._curtoken == b"true": - token: Union[bool, PSKeyword] = True - elif self._curtoken == b"false": - token = False - else: - token = KWD(self._curtoken) - self._add_token(token) - self._parse1 = self._parse_main - return j - - -@functools.wraps(PSBaseParser.nexttoken) -def nexttoken(self) -> Tuple[int, PSBaseParserToken]: - if self.eof: - # It's not really unexpected, come on now... - raise PSEOF("Unexpected EOF") - while not self._tokens: - try: - self.fillbuf() - self.charpos = self._parse1(self.buf, self.charpos) - except PSEOF: - # If we hit EOF in the middle of a token, try to parse - # it by tacking on whitespace, and delay raising PSEOF - # until next time around - self.charpos = self._parse1(b"\n", 0) - self.eof = True - # Oh, so there wasn't actually a token there? OK. - if not self._tokens: - raise - token = self._tokens.pop(0) - log.debug("nexttoken: %r", token) - return token - - -def patch_psparser(): - """Monkey-patch certain versions of pdfminer.six to avoid dropping - tokens at EOF (before 20231228) and splitting tokens at buffer - boundaries (20231228 and 20240706). - """ - # Presuming the bug will be fixed in the next release - if pdfminer.__version__ <= "20240706": - PSBaseParser.seek = seek - PSBaseParser._parse_keyword = _parse_keyword - PSBaseParser.nexttoken = nexttoken
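For the multi-CPU path that replaces the deleted repair logic above, a minimal sketch of the parallel page-extraction pattern used by the new open_pdfminer_pages_generator, again restricted to calls that appear in this diff; "example.pdf" is a placeholder, and the half-the-CPUs worker count is this PR's policy rather than a playa default:

import os
from functools import partial

import playa
from paves.miner import LAParams, extract_page

laparams = LAParams()
with playa.open(
    "example.pdf",  # hypothetical input path
    space="page",
    max_workers=max(1, (os.cpu_count() or 1) // 2),  # at least one worker, up to half the CPUs
) as doc:
    # doc.pages.map applies extract_page to each page, potentially in parallel
    # across the worker pool; zip pairs every page with its extracted layout.
    for page, page_layout in zip(
        doc.pages, doc.pages.map(partial(extract_page, laparams=laparams))
    ):
        print(page_layout.width, page_layout.height)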