From 164b8f3506d14f1bd0c774c82aba3b77545ceec3 Mon Sep 17 00:00:00 2001 From: luis-cd Date: Wed, 13 Aug 2025 02:08:08 +0200 Subject: [PATCH 1/3] Fix #3418: Clean remove_page removes orphan objects by rebuilding writer after writes --- pypdf/_writer.py | 47 ++++++++++++++++++++++++++++++++++++- tests/test_writer.py | 55 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index a2209c8f4..87f2cd9a2 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -60,6 +60,7 @@ _get_max_pdf_version_header, deprecation_no_replacement, logger_warning, + logger_error, ) from .constants import AnnotationDictionaryAttributes as AA from .constants import CatalogAttributes as CA @@ -1417,6 +1418,44 @@ def _resolve_links(self) -> None: continue new_link.patch_reference(self, new_page) + def remove_page(self, page: Union[int, PageObject, IndirectObject], clean: bool = False) -> None: + """ + Override PdfDocCommon.remove_page to perform extra cleanup when requested. + + Calls base implementation and then, if clean=True and the writer was already written to, + rebuilds the internal structure to remove orphan objects leftover from previous writes. + """ + super().remove_page(page, clean=clean) + + if clean and getattr(self, "_has_written", False): + try: + self._rebuild_via_io() + except Exception as e: + logger_warning(f"Failed to rebuild PdfWriter after remove_page with clean=True: {e}", __name__) + # No re-raise: keep remove_page robust, tests/CI should catch the error + + + def _rebuild_via_io(self) -> None: + """ + Rebuild the writer by serializing to an in-memory buffer and re-reading. + + This removes orphan objects that persist after intermediate writes. + """ + from io import BytesIO + from pypdf import PdfReader + + buf = BytesIO() + self.write(buf) + buf.seek(0) + + reader = PdfReader(buf) + new_writer = PdfWriter() + new_writer.clone_document_from_reader(reader) + + # Replace internal state of this instance with the new clean writer's state + self.__dict__.clear() + self.__dict__.update(new_writer.__dict__) + def write_stream(self, stream: StreamType) -> None: if hasattr(stream, "mode") and "b" not in stream.mode: logger_warning( @@ -1461,7 +1500,13 @@ def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]: stream = FileIO(stream, "wb") my_file = True - self.write_stream(stream) + try: + self.write_stream(stream) + except Exception as err: + logger_error(f"Failed to write to {stream}", __name__) + raise err + else: + self._has_written = True if my_file: stream.close() diff --git a/tests/test_writer.py b/tests/test_writer.py index a0b9b3453..cef87b0bd 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2485,6 +2485,61 @@ def test_compress_identical_objects(): assert len(out2.getvalue()) > len(out3.getvalue()) +@pytest.mark.enable_socket +def test_issue_3418_remove_after_writes_leaves_orphans(): + """ + Regression test for issue #3418. + + Reproduces the original sequence: write after first append, write after second append, + then remove last page + compress. The expectation is that after remove+compress the + size is not larger than the original single-page output. If the bug is present, + the final size may be larger (i.e. objects were not cleaned up). + """ + url = ( + "https://github.com/py-pdf/sample-files/blob/main/001-trivial/minimal-document.pdf?raw=true" + ) + name = "minimal-document.pdf" + data = BytesIO(get_data_from_url(url, name=name)) + reader1 = PdfReader(data) + + writer = PdfWriter() + + # Append first page and write it out (this is important; it mutates writer state) + writer.append(reader1) + out1 = BytesIO() + writer.write(out1) + size_one_page = out1.tell() + + # Append a second (identical) page and write again + # use a fresh reader to match the original issue's steps + reader2 = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer.append(reader2) + out2 = BytesIO() + writer.write(out2) + size_two_pages = out2.tell() + + assert size_two_pages > size_one_page # sanity check + + # Now remove the last page and attempt to compress/clean + writer.remove_page(len(writer.pages) - 1, clean=True) + writer.compress_identical_objects(remove_orphans=True, remove_identicals=True) + + out3 = BytesIO() + writer.write(out3) + size_after = out3.tell() + + # Expected behavior: after removing + compressing, the size should be <= the original single-page PDF. + # This assertion should FAIL on the buggy implementation (showing the regression). + assert size_after <= size_one_page, ( + "After remove + compress the PDF should be no larger than the single-page output. " + f"size_one_page={size_one_page}, size_after={size_after}" + ) + + # And, of course, the output must have one page + result_pdf = PdfReader(out3) + assert len(result_pdf.pages) == 1 + + def test_set_need_appearances_writer(): """Minimal test for coverage""" writer = PdfWriter() From a8fdfad9afb803ba26af4937ed2515e3351ede01 Mon Sep 17 00:00:00 2001 From: luis-cd Date: Wed, 13 Aug 2025 02:27:01 +0200 Subject: [PATCH 2/3] Fix #3418: move imports to top-level to satisfy ruff --- pypdf/_writer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 87f2cd9a2..bfd637719 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -59,8 +59,8 @@ StreamType, _get_max_pdf_version_header, deprecation_no_replacement, - logger_warning, logger_error, + logger_warning, ) from .constants import AnnotationDictionaryAttributes as AA from .constants import CatalogAttributes as CA @@ -1441,9 +1441,6 @@ def _rebuild_via_io(self) -> None: This removes orphan objects that persist after intermediate writes. """ - from io import BytesIO - from pypdf import PdfReader - buf = BytesIO() self.write(buf) buf.seek(0) From 25a96840a54f649c0b2502f8df2870a7d5b759ae Mon Sep 17 00:00:00 2001 From: luis-cd Date: Wed, 13 Aug 2025 02:42:45 +0200 Subject: [PATCH 3/3] ROB: Log warning when PdfWriter rebuild fails in remove_page(clean=True) This change ensures that failures during the internal rebuild process after removing a page with clean=True do not crash the operation but instead log a warning. Closes #3418 --- pypdf/_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index bfd637719..4eb27f965 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1430,8 +1430,8 @@ def remove_page(self, page: Union[int, PageObject, IndirectObject], clean: bool if clean and getattr(self, "_has_written", False): try: self._rebuild_via_io() - except Exception as e: - logger_warning(f"Failed to rebuild PdfWriter after remove_page with clean=True: {e}", __name__) + except Exception as exc: + logger_warning(f"Failed to rebuild PdfWriter after remove_page with clean=True: {exc}", __name__) # No re-raise: keep remove_page robust, tests/CI should catch the error