diff --git a/pypdf/_page.py b/pypdf/_page.py index 8dda9cf10..9c98d8b22 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -511,6 +511,7 @@ def __init__( assert indirect_reference is not None, "mypy" self.update(cast(DictionaryObject, indirect_reference.get_object())) self._font_width_maps: Dict[str, Tuple[Dict[str, float], str, float]] = {} + self._merged_in_pages: List[IndirectObject] = [] def hash_bin(self) -> int: """ @@ -1076,6 +1077,10 @@ def _merge_page( over: bool = True, expand: bool = False, ) -> None: + # Track merged-in pages so we can do link rewriting correctly + if page2.indirect_reference: + self._merged_in_pages.append(page2.indirect_reference) + # First we work on merging the resource dictionaries. This allows us # to find out what symbols in the content streams we might need to # rename. diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 2f43cf544..9acd0c988 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -519,6 +519,11 @@ def _add_page( # later. self._unresolved_links.extend(extract_links(page, page_org)) self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference + # the original page may have been created by merging the link + # target page into it, so we need to also track the merged-in + # pages that formed this page + for merged_in in page_org._merged_in_pages: + self._merged_in_pages[merged_in] = page.indirect_reference return page diff --git a/pypdf/generic/_link.py b/pypdf/generic/_link.py index 68161e042..e8203117e 100644 --- a/pypdf/generic/_link.py +++ b/pypdf/generic/_link.py @@ -30,25 +30,50 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast -from . import ArrayObject, DictionaryObject, IndirectObject, PdfObject, TextStringObject +from . 
import ArrayObject, DictionaryObject, IndirectObject, NullObject, PdfObject, TextStringObject if TYPE_CHECKING: from .._page import PageObject + from .._protocols import PdfCommonDocProtocol from .._reader import PdfReader from .._writer import PdfWriter + from ..generic import Destination class NamedReferenceLink: """Named reference link being preserved until we can resolve it correctly.""" - def __init__(self, reference: TextStringObject, source_pdf: "PdfReader") -> None: + def __init__(self, reference: TextStringObject, page: "PageObject") -> None: """reference: TextStringObject with named reference""" self._reference = reference - self._source_pdf = source_pdf + + # to work out where the reference points we need to find the + # source PDF which the reference is pointing to. this *can* + # be the PDF the page containing the link comes from, but it + # may also be any PDF merged into this page, so we search + # each merged-in source in turn until the name is resolved + destination = self._find_page_in(page.pdf) + + for src_page in page._merged_in_pages: + if destination: + break + destination = self._find_page_in(src_page.pdf) + + if destination and not isinstance(destination.dest_array[0], NullObject): + self._referenced_page = destination.dest_array[0] + else: + self._referenced_page = None + + def _find_page_in(self, pdf: "Optional[PdfCommonDocProtocol]") -> "Optional[Destination]": + if not pdf or not hasattr(pdf, "named_destinations"): + return None + reader: PdfReader = cast("PdfReader", pdf) + return reader.named_destinations.get(str(self._reference)) def find_referenced_page(self) -> Union[IndirectObject, None]: - destination = self._source_pdf.named_destinations.get(str(self._reference)) - return destination.page if destination else None + if self._referenced_page: + return self._referenced_page.indirect_reference + return None def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None: """target_pdf: PdfWriter which the new link went into""" @@ -90,7 
+115,6 @@ def extract_links(new_page: "PageObject", old_page: "PageObject") -> List[Tuple[ def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]: - src = cast("PdfReader", page.pdf) link = cast(DictionaryObject, indirect_object.get_object()) if (not isinstance(link, DictionaryObject)) or link.get("/Subtype") != "/Link": return None @@ -100,17 +124,17 @@ def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional if action.get("/S") != "/GoTo": return None - return _create_link(action["/D"], src) + return _create_link(action["/D"], page) if "/Dest" in link: - return _create_link(link["/Dest"], src) + return _create_link(link["/Dest"], page) return None # Nothing to do here -def _create_link(reference: PdfObject, source_pdf: "PdfReader")-> Optional[ReferenceLink]: +def _create_link(reference: PdfObject, page: "PageObject")-> Optional[ReferenceLink]: if isinstance(reference, TextStringObject): - return NamedReferenceLink(reference, source_pdf) + return NamedReferenceLink(reference, page) if isinstance(reference, ArrayObject): return DirectReferenceLink(reference) return None diff --git a/tests/test_merger.py b/tests/test_merger.py index 5c97fc15d..8294693f0 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -513,3 +513,66 @@ def test_named_ref_to_page_that_is_gone(pdf_file_path): writer = PdfWriter() writer.add_page(source.pages[0]) # now references to non-existent page writer.write(pdf_file_path) # don't crash + + +@pytest.mark.enable_socket +def test_merge_direct_link_preserved(pdf_file_path): + # this could be any PDF -- we don't care 
which + reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf"))) + writer = PdfWriter(clone_from=reader) + + # this PDF has a direct link from p1 to p2 + merger = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf"))) + for p in merger.pages: + # we are deliberately merging into a blank page first, to + # verify that links are preserved even when we are not adding + # the source page directly + new_page = p.create_blank_page( + writer, width = p.mediabox.width, height = p.mediabox.height + ) + new_page.merge_page(p) + writer.add_page(new_page) + + writer.write(pdf_file_path) + + check = PdfReader(pdf_file_path) + page3 = check.pages[2] + link = page3["/Annots"][0].get_object() + assert link["/Subtype"] == "/Link" + dest = link["/Dest"][0] # indirect reference of page referred to + + page4 = check.flattened_pages[3] + assert dest == page4.indirect_reference, "Link from page 3 to page 4 is broken" + + +@pytest.mark.enable_socket +def test_merged_named_reference_preserved(pdf_file_path): + # this could be any PDF -- we don't care which + reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf"))) + writer = PdfWriter(clone_from=reader) + + # this PDF has a named reference from p3 to p5 + merger = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf"))) + for p in merger.pages: + # we are deliberately merging into a blank page first, to + # verify that links are preserved even when we are not adding + # the source page directly + new_page = p.create_blank_page( + writer, width = p.mediabox.width, height = p.mediabox.height + ) + new_page.merge_page(p) + writer.add_page(new_page) + + writer.write(pdf_file_path) + + check = PdfReader(pdf_file_path) + page5 = check.pages[4] + page7 = check.flattened_pages[6] + for link in page5["/Annots"]: + action = link["/A"] + assert action.get("/S") == "/GoTo" + dest = str(action["/D"]) + assert dest in check.named_destinations + pref = check.named_destinations[dest].page + + assert pref == 
page7.indirect_reference, "Link from page 5 to page 7 is broken"