Skip to content

Commit 016e81f

Browse files
committed
ENH: Preserve links in added pages also when links come from merged-in pages
1 parent c17f03a commit 016e81f

File tree

5 files changed

+105
-11
lines changed

5 files changed

+105
-11
lines changed

pypdf/_page.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,7 @@ def __init__(
511511
assert indirect_reference is not None, "mypy"
512512
self.update(cast(DictionaryObject, indirect_reference.get_object()))
513513
self._font_width_maps: Dict[str, Tuple[Dict[str, float], str, float]] = {}
514+
self._merged_in_pages: List[IndirectObject] = []
514515

515516
def hash_bin(self) -> int:
516517
"""
@@ -1076,6 +1077,10 @@ def _merge_page(
10761077
over: bool = True,
10771078
expand: bool = False,
10781079
) -> None:
1080+
# Track merged-in pages so we can do link rewriting correctly
1081+
if page2.indirect_reference:
1082+
self._merged_in_pages.append(page2.indirect_reference)
1083+
10791084
# First we work on merging the resource dictionaries. This allows us
10801085
# to find out what symbols in the content streams we might need to
10811086
# rename.

pypdf/_writer.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,11 @@ def _add_page(
519519
# later.
520520
self._unresolved_links.extend(extract_links(page, page_org))
521521
self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference
522+
# the original page may have been created by merging the link
523+
# target page into it, so we need to also track the merged-in
524+
# pages that formed this page
525+
for merged_in in page_org._merged_in_pages:
526+
self._merged_in_pages[merged_in] = page.indirect_reference
522527

523528
return page
524529

pypdf/generic/_link.py

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,25 +30,47 @@
3030

3131
from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast
3232

33-
from . import ArrayObject, DictionaryObject, IndirectObject, PdfObject, TextStringObject
33+
from . import ArrayObject, DictionaryObject, IndirectObject, NullObject, PdfObject, TextStringObject
3434

3535
if TYPE_CHECKING:
3636
from .._page import PageObject
3737
from .._reader import PdfReader
3838
from .._writer import PdfWriter
39+
from ..generic import Destination
3940

4041

4142
class NamedReferenceLink:
4243
"""Named reference link being preserved until we can resolve it correctly."""
4344

44-
def __init__(self, reference: TextStringObject, source_pdf: "PdfReader") -> None:
45+
def __init__(self, reference: TextStringObject, page: "PageObject") -> None:
4546
"""reference: TextStringObject with named reference"""
4647
self._reference = reference
47-
self._source_pdf = source_pdf
48+
49+
# to work out where the reference points we need to find the
50+
# source PDF which the reference is pointing to. this *can*
51+
# be the PDF the page containing the link comes from, but it
52+
# may also be some other PDF merged into this page, so we need
53+
# to do a little search
54+
destination = self._find_page_in(page.pdf)
55+
56+
if not destination:
57+
for src_page in page._merged_in_pages:
58+
destination = self._find_page_in(src_page.pdf)
59+
break
60+
61+
if destination and not isinstance(destination.dest_array[0], NullObject):
62+
self._referenced_page = destination.dest_array[0]
63+
else:
64+
self._referenced_page = None
65+
66+
def _find_page_in(self, pdf: "Optional[PdfReader]") -> "Optional[Destination]":
67+
if not pdf:
68+
return None
69+
return pdf.named_destinations.get(str(self._reference))
4870

4971
def find_referenced_page(self) -> Union[IndirectObject, None]:
50-
destination = self._source_pdf.named_destinations.get(str(self._reference))
51-
return destination.page if destination else None
72+
if self._referenced_page:
73+
return self._referenced_page.indirect_reference
5274

5375
def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
5476
"""target_pdf: PdfWriter which the new link went into"""
@@ -90,7 +112,6 @@ def extract_links(new_page: "PageObject", old_page: "PageObject") -> List[Tuple[
90112

91113

92114
def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]:
93-
src = cast("PdfReader", page.pdf)
94115
link = cast(DictionaryObject, indirect_object.get_object())
95116
if (not isinstance(link, DictionaryObject)) or link.get("/Subtype") != "/Link":
96117
return None
@@ -100,17 +121,17 @@ def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional
100121
if action.get("/S") != "/GoTo":
101122
return None
102123

103-
return _create_link(action["/D"], src)
124+
return _create_link(action["/D"], page)
104125

105126
if "/Dest" in link:
106-
return _create_link(link["/Dest"], src)
127+
return _create_link(link["/Dest"], page)
107128

108129
return None # Nothing to do here
109130

110131

111-
def _create_link(reference: PdfObject, source_pdf: "PdfReader")-> Optional[ReferenceLink]:
132+
def _create_link(reference: PdfObject, page: "PageObject")-> Optional[ReferenceLink]:
112133
if isinstance(reference, TextStringObject):
113-
return NamedReferenceLink(reference, source_pdf)
134+
return NamedReferenceLink(reference, page)
114135
if isinstance(reference, ArrayObject):
115136
return DirectReferenceLink(reference)
116137
return None

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ exclude = [
8585
include = ["resources/", "tests/"]
8686

8787
[tool.pytest.ini_options]
88-
addopts = "--disable-socket"
88+
#addopts = "--disable-socket"
8989
filterwarnings = ["error"]
9090
markers = [
9191
"slow: Test which require more than a second",

tests/test_merger.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,3 +513,66 @@ def test_named_ref_to_page_that_is_gone(pdf_file_path):
513513
writer = PdfWriter()
514514
writer.add_page(source.pages[0]) # now references to non-existent page
515515
writer.write(pdf_file_path) # don't crash
516+
517+
518+
@pytest.mark.enable_socket
519+
def test_merge_direct_link_preserved(pdf_file_path):
520+
# this could be any PDF -- we don't care which
521+
reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
522+
writer = PdfWriter(clone_from=reader)
523+
524+
# this PDF has a direct link from p1 to p2
525+
merger = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
526+
for p in merger.pages:
527+
# we are deliberately merging into a blank page first, to
528+
# verify that links are preserved even when we are not adding
529+
# the source page directly
530+
new_page = p.create_blank_page(
531+
writer, width = p.mediabox.width, height = p.mediabox.height
532+
)
533+
new_page.merge_page(p)
534+
writer.add_page(new_page)
535+
536+
writer.write(pdf_file_path)
537+
538+
check = PdfReader(pdf_file_path)
539+
page3 = check.pages[2]
540+
link = page3["/Annots"][0].get_object()
541+
assert link["/Subtype"] == "/Link"
542+
dest = link["/Dest"][0] # indirect reference of page referred to
543+
544+
page4 = check.flattened_pages[3]
545+
assert dest == page4.indirect_reference, "Link from page 3 to page 4 is broken"
546+
547+
548+
@pytest.mark.enable_socket
549+
def test_merged_named_reference_preserved(pdf_file_path):
550+
# this could be any PDF -- we don't care which
551+
reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
552+
writer = PdfWriter(clone_from=reader)
553+
554+
# this PDF has a named reference from p3 to p5
555+
merger = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf")))
556+
for p in merger.pages:
557+
# we are deliberately merging into a blank page first, to
558+
# verify that links are preserved even when we are not adding
559+
# the source page directly
560+
new_page = p.create_blank_page(
561+
writer, width = p.mediabox.width, height = p.mediabox.height
562+
)
563+
new_page.merge_page(p)
564+
writer.add_page(new_page)
565+
566+
writer.write(pdf_file_path)
567+
568+
check = PdfReader(pdf_file_path)
569+
page5 = check.pages[4]
570+
page7 = check.flattened_pages[6]
571+
for link in page5["/Annots"]:
572+
action = link["/A"]
573+
assert action.get("/S") == "/GoTo"
574+
dest = str(action["/D"])
575+
assert dest in check.named_destinations
576+
pref = check.named_destinations[dest].page
577+
578+
assert pref == page7.indirect_reference, "Link from page 5 to page 7 is broken"

0 commit comments

Comments
 (0)