From 164b8f3506d14f1bd0c774c82aba3b77545ceec3 Mon Sep 17 00:00:00 2001
From: luis-cd <luiscrujeirasdiaz@gmail.com>
Date: Wed, 13 Aug 2025 02:08:08 +0200
Subject: [PATCH 1/3] Fix #3418: Clean remove_page removes orphan objects by
 rebuilding writer after writes

---
 pypdf/_writer.py     | 47 ++++++++++++++++++++++++++++++++++++-
 tests/test_writer.py | 55 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index a2209c8f4..87f2cd9a2 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -60,6 +60,7 @@
     _get_max_pdf_version_header,
     deprecation_no_replacement,
     logger_warning,
+    logger_error,
 )
 from .constants import AnnotationDictionaryAttributes as AA
 from .constants import CatalogAttributes as CA
@@ -1417,6 +1418,44 @@ def _resolve_links(self) -> None:
                 continue
             new_link.patch_reference(self, new_page)
 
+    def remove_page(self, page: Union[int, PageObject, IndirectObject], clean: bool = False) -> None:
+        """
+        Override PdfDocCommon.remove_page to perform extra cleanup when requested.
+
+        Calls base implementation and then, if clean=True and the writer was already written to,
+        rebuilds the internal structure to remove orphan objects leftover from previous writes.
+        """
+        super().remove_page(page, clean=clean)
+
+        if clean and getattr(self, "_has_written", False):
+            try:
+                self._rebuild_via_io()
+            except Exception as e:
+                logger_warning(f"Failed to rebuild PdfWriter after remove_page with clean=True: {e}", __name__)
+                # No re-raise: keep remove_page robust, tests/CI should catch the error
+
+
+    def _rebuild_via_io(self) -> None:
+        """
+        Rebuild the writer by serializing to an in-memory buffer and re-reading.
+
+        This removes orphan objects that persist after intermediate writes.
+        """
+        from io import BytesIO
+        from pypdf import PdfReader
+
+        buf = BytesIO()
+        self.write(buf)
+        buf.seek(0)
+
+        reader = PdfReader(buf)
+        new_writer = PdfWriter()
+        new_writer.clone_document_from_reader(reader)
+
+        # Replace internal state of this instance with the new clean writer's state
+        self.__dict__.clear()
+        self.__dict__.update(new_writer.__dict__)
+
     def write_stream(self, stream: StreamType) -> None:
         if hasattr(stream, "mode") and "b" not in stream.mode:
             logger_warning(
@@ -1461,7 +1500,13 @@ def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
             stream = FileIO(stream, "wb")
             my_file = True
 
-        self.write_stream(stream)
+        try:
+            self.write_stream(stream)
+        except Exception as err:
+            logger_error(f"Failed to write to {stream}", __name__)
+            raise err
+        else:
+            self._has_written = True
 
         if my_file:
             stream.close()
diff --git a/tests/test_writer.py b/tests/test_writer.py
index a0b9b3453..cef87b0bd 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2485,6 +2485,61 @@ def test_compress_identical_objects():
     assert len(out2.getvalue()) > len(out3.getvalue())
 
 
+@pytest.mark.enable_socket
+def test_issue_3418_remove_after_writes_leaves_orphans():
+    """
+    Regression test for issue #3418.
+
+    Reproduces the original sequence: write after first append, write after second append,
+    then remove last page + compress. The expectation is that after remove+compress the
+    size is not larger than the original single-page output. If the bug is present,
+    the final size may be larger (i.e. objects were not cleaned up).
+    """
+    url = (
+        "https://github.com/py-pdf/sample-files/blob/main/001-trivial/minimal-document.pdf?raw=true"
+    )
+    name = "minimal-document.pdf"
+    data = BytesIO(get_data_from_url(url, name=name))
+    reader1 = PdfReader(data)
+
+    writer = PdfWriter()
+
+    # Append first page and write it out (this is important; it mutates writer state)
+    writer.append(reader1)
+    out1 = BytesIO()
+    writer.write(out1)
+    size_one_page = out1.tell()
+
+    # Append a second (identical) page and write again
+    # use a fresh reader to match the original issue's steps
+    reader2 = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    writer.append(reader2)
+    out2 = BytesIO()
+    writer.write(out2)
+    size_two_pages = out2.tell()
+
+    assert size_two_pages > size_one_page  # sanity check
+
+    # Now remove the last page and attempt to compress/clean
+    writer.remove_page(len(writer.pages) - 1, clean=True)
+    writer.compress_identical_objects(remove_orphans=True, remove_identicals=True)
+
+    out3 = BytesIO()
+    writer.write(out3)
+    size_after = out3.tell()
+
+    # Expected behavior: after removing + compressing, the size should be <= the original single-page PDF.
+    # This assertion should FAIL on the buggy implementation (showing the regression).
+    assert size_after <= size_one_page, (
+        "After remove + compress the PDF should be no larger than the single-page output. "
+        f"size_one_page={size_one_page}, size_after={size_after}"
+    )
+
+    # And, of course, the output must have one page
+    result_pdf = PdfReader(out3)
+    assert len(result_pdf.pages) == 1
+
+
 def test_set_need_appearances_writer():
     """Minimal test for coverage"""
     writer = PdfWriter()

From a8fdfad9afb803ba26af4937ed2515e3351ede01 Mon Sep 17 00:00:00 2001
From: luis-cd <luiscrujeirasdiaz@gmail.com>
Date: Wed, 13 Aug 2025 02:27:01 +0200
Subject: [PATCH 2/3] Fix #3418: move imports to top-level to satisfy ruff

---
 pypdf/_writer.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 87f2cd9a2..bfd637719 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -59,8 +59,8 @@
     StreamType,
     _get_max_pdf_version_header,
     deprecation_no_replacement,
-    logger_warning,
     logger_error,
+    logger_warning,
 )
 from .constants import AnnotationDictionaryAttributes as AA
 from .constants import CatalogAttributes as CA
@@ -1441,9 +1441,6 @@ def _rebuild_via_io(self) -> None:
 
         This removes orphan objects that persist after intermediate writes.
         """
-        from io import BytesIO
-        from pypdf import PdfReader
-
         buf = BytesIO()
         self.write(buf)
         buf.seek(0)

From 25a96840a54f649c0b2502f8df2870a7d5b759ae Mon Sep 17 00:00:00 2001
From: luis-cd <luiscrujeirasdiaz@gmail.com>
Date: Wed, 13 Aug 2025 02:42:45 +0200
Subject: [PATCH 3/3] ROB: Log warning when PdfWriter rebuild fails in
 remove_page(clean=True)

This change ensures that failures during the internal rebuild process after removing a page
with clean=True do not crash the operation but instead log a warning.

Closes #3418
---
 pypdf/_writer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index bfd637719..4eb27f965 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1430,8 +1430,8 @@ def remove_page(self, page: Union[int, PageObject, IndirectObject], clean: bool
         if clean and getattr(self, "_has_written", False):
             try:
                 self._rebuild_via_io()
-            except Exception as e:
-                logger_warning(f"Failed to rebuild PdfWriter after remove_page with clean=True: {e}", __name__)
+            except Exception as exc:
+                logger_warning(f"Failed to rebuild PdfWriter after remove_page with clean=True: {exc}", __name__)
                 # No re-raise: keep remove_page robust, tests/CI should catch the error