Skip to content

Commit b6fff56

Browse files
BUG: Fix handling of UTF-16 encoded destination titles (#3463)
Closes #3462.
1 parent cff5db6 commit b6fff56

File tree

4 files changed (+284 additions, -11 deletions)

pypdf/_doc_common.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -485,11 +485,10 @@ def _get_named_destinations(
485485
names = cast(DictionaryObject, tree[CA.NAMES])
486486
i = 0
487487
while i < len(names):
488-
original_key = names[i].get_object()
488+
key = names[i].get_object()
489489
i += 1
490-
if not isinstance(original_key, (bytes, str)):
490+
if not isinstance(key, (bytes, str)):
491491
continue
492-
key = str(original_key)
493492
try:
494493
value = names[i].get_object()
495494
except IndexError:
@@ -502,7 +501,9 @@ def _get_named_destinations(
502501
continue
503502
dest = self._build_destination(key, value)
504503
if dest is not None:
505-
retval[key] = dest
504+
retval[cast(str, dest["/Title"])] = dest
505+
# Remain backwards-compatible.
506+
retval[str(key)] = dest
506507
else: # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
507508
for k__, v__ in tree.items():
508509
val = v__.get_object()
@@ -928,7 +929,7 @@ def get_destination_page_number(self, destination: Destination) -> Optional[int]
928929

929930
def _build_destination(
930931
self,
931-
title: str,
932+
title: Union[str, bytes],
932933
array: Optional[
933934
list[
934935
Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]
@@ -948,7 +949,7 @@ def _build_destination(
948949
try:
949950
return Destination(title, page, Fit(fit_type=typ, fit_args=array)) # type: ignore
950951
except PdfReadError:
951-
logger_warning(f"Unknown destination: {title} {array}", __name__)
952+
logger_warning(f"Unknown destination: {title!r} {array}", __name__)
952953
if self.strict:
953954
raise
954955
# create a link to first Page

pypdf/generic/_data_structures.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1600,7 +1600,7 @@ class Destination(TreeObject):
16001600

16011601
def __init__(
16021602
self,
1603-
title: str,
1603+
title: Union[str, bytes],
16041604
page: Union[NumberObject, IndirectObject, NullObject, DictionaryObject],
16051605
fit: Fit,
16061606
) -> None:

tests/test_doc_common.py

Lines changed: 275 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,17 @@
11
"""Test the pypdf._doc_common module."""
2+
import itertools
23
import re
34
import shutil
45
import subprocess
56
from io import BytesIO
7+
from operator import itemgetter
68
from pathlib import Path
79
from unittest import mock
810

911
import pytest
1012

1113
from pypdf import PdfReader, PdfWriter
12-
from pypdf.generic import EmbeddedFile, NullObject, ViewerPreferences
14+
from pypdf.generic import EmbeddedFile, NullObject, TextStringObject, ViewerPreferences
1315
from tests import get_data_from_url
1416

1517
TESTS_ROOT = Path(__file__).parent.resolve()
@@ -121,6 +123,7 @@ def test_byte_encoded_named_destinations():
121123
if action["/S"] == "/GoTo":
122124
named_dest = action["/D"]
123125
assert str(named_dest) in reader.named_destinations
126+
assert TextStringObject(named_dest) in reader.named_destinations
124127

125128
assert reader.named_destinations == {
126129
"Doc-Start": {
@@ -131,14 +134,23 @@ def test_byte_encoded_named_destinations():
131134
"/Top": 667.198,
132135
"/Zoom": NullObject()
133136
},
134-
"楣整搮捡귃㉫㈰爵捡牥汦杩瑨敷杩瑨瑳瑡捩慤慴": {
135-
"/Title": "楣整搮捡귃㉫㈰爵捡牥汦杩瑨敷杩瑨瑳瑡捩慤慴",
137+
"cite.dacÃ\xadk2025racerflightweightstaticdata": {
138+
"/Title": "cite.dacÃ\xadk2025racerflightweightstaticdata",
136139
"/Page": page.indirect_reference,
137140
"/Type": "/XYZ",
138141
"/Left": 133.768,
139142
"/Top": 614.424,
140143
"/Zoom": NullObject()
141144
},
145+
# This is the same as the previous entry, but with `str(name)` instead of the title.
146+
"楣整搮捡귃㉫㈰爵捡牥汦杩瑨敷杩瑨瑳瑡捩慤慴": {
147+
"/Left": 133.768,
148+
"/Page": page.indirect_reference,
149+
"/Title": "cite.dacÃ\xadk2025racerflightweightstaticdata",
150+
"/Top": 614.424,
151+
"/Type": "/XYZ",
152+
"/Zoom": NullObject()
153+
},
142154
"page.1": {
143155
"/Title": "page.1",
144156
"/Page": page.indirect_reference,
@@ -177,3 +189,263 @@ def test_named_destinations__tree_is_null_object():
177189
reader = PdfReader(BytesIO(get_data_from_url(url=url, name=name)))
178190

179191
assert reader.named_destinations == {}
192+
193+
194+
@pytest.mark.enable_socket
195+
def test_outline__issue3462():
196+
url = "https://github.com/user-attachments/files/22293402/e371fffe0b_a7cccde95a.pdf"
197+
name = "issue3462.pdf"
198+
reader = PdfReader(BytesIO(get_data_from_url(url=url, name=name)))
199+
200+
outline_flat = list(
201+
itertools.chain.from_iterable(
202+
entry if isinstance(entry, list) else [entry] for entry in reader.outline
203+
)
204+
)
205+
assert list(map(itemgetter("/Title"), outline_flat)) == [
206+
"AR 2021 - Daftar Isi",
207+
"Page 1",
208+
"Page 2",
209+
"Page 3",
210+
"Page 4",
211+
"Page 5",
212+
"AR 2021 Book 001 (Highlights - Ikhtisar Saham)",
213+
"Page 1",
214+
"Page 2",
215+
"Page 3",
216+
"Page 4",
217+
"Page 5",
218+
"AR 2021 Book 002 (Laporan Manajemen)",
219+
"Page 1",
220+
"Page 2",
221+
"Page 3",
222+
"Page 4",
223+
"Page 5",
224+
"Page 6",
225+
"Page 7",
226+
"Page 8",
227+
"Page 9",
228+
"AR 2021 Book 003-1 (Profil Perusahaan)",
229+
"Page 1",
230+
"Page 2",
231+
"Page 3",
232+
"Page 4",
233+
"Page 5",
234+
"Page 6",
235+
"Page 7",
236+
"Page 8",
237+
"Page 9",
238+
"Page 10",
239+
"Page 11",
240+
"Page 12",
241+
"Page 13",
242+
"Page 14",
243+
"Page 15",
244+
"Page 16",
245+
"Page 17",
246+
"Page 18",
247+
"Page 19",
248+
"Page 20",
249+
"Page 21",
250+
"Page 22",
251+
"Page 23",
252+
"Page 24",
253+
"Page 25",
254+
"Page 26",
255+
"Page 27",
256+
"Page 28",
257+
"Page 29",
258+
"Page 30",
259+
"Page 31",
260+
"Page 32",
261+
"Page 33",
262+
"Page 34",
263+
"Page 35",
264+
"Page 36",
265+
"Page 37",
266+
"Page 38",
267+
"Page 39",
268+
"Page 40",
269+
"Page 41",
270+
"Page 42",
271+
"Page 43",
272+
"Page 44",
273+
"Page 45",
274+
"Page 46",
275+
"Page 47",
276+
"AR 2021 Book 003-2 (Sumber Daya Manusia)",
277+
"Page 1",
278+
"Page 2",
279+
"Page 3",
280+
"Page 4",
281+
"Page 5",
282+
"Page 6",
283+
"Page 7",
284+
"Page 8",
285+
"Page 9",
286+
"Page 10",
287+
"Page 11",
288+
"Page 12",
289+
"AR 2021 Book 003-3 (Komposisi pemegang saham)",
290+
"Page 1",
291+
"Page 2",
292+
"Page 3",
293+
"Page 4",
294+
"Page 5",
295+
"Page 6",
296+
"AR 2021 Book 003-4 (Kronologis Pencatatan Saham)",
297+
"Page 1",
298+
"Page 2",
299+
"AR 2021 Book 003-5 (Akuntan Publik Independen)",
300+
"Page 1",
301+
"Page 2",
302+
"Page 3",
303+
"AR 2021 Book 004 (Analisa dan Pembahasan Manajemen)",
304+
"Page 1",
305+
"Page 2",
306+
"Page 3",
307+
"Page 4",
308+
"Page 5",
309+
"Page 6",
310+
"Page 7",
311+
"Page 8",
312+
"Page 9",
313+
"Page 10",
314+
"Page 11",
315+
"Page 12",
316+
"Page 13",
317+
"Page 14",
318+
"Page 15",
319+
"Page 16",
320+
"Page 17",
321+
"Page 18",
322+
"Page 19",
323+
"Page 20",
324+
"Page 21",
325+
"AR 2021 Book 005-1 (Tata Kelola Perusahaan)",
326+
"Page 1",
327+
"Page 2",
328+
"Page 3",
329+
"Page 4",
330+
"Page 5",
331+
"Page 6",
332+
"Page 7",
333+
"Page 8",
334+
"Page 9",
335+
"Page 10",
336+
"Page 11",
337+
"Page 12",
338+
"AR 2021 Book 005-2 (Direksi-Komisaris)",
339+
"Page 1",
340+
"Page 2",
341+
"Page 3",
342+
"Page 4",
343+
"Page 5",
344+
"Page 6",
345+
"Page 7",
346+
"Page 8",
347+
"Page 9",
348+
"Page 10",
349+
"Page 11",
350+
"Page 12",
351+
"Page 13",
352+
"Page 14",
353+
"Page 15",
354+
"Page 16",
355+
"Page 17",
356+
"Page 18",
357+
"Page 19",
358+
"Page 20",
359+
"Page 21",
360+
"Page 22",
361+
"Page 23",
362+
"Page 24",
363+
"Page 25",
364+
"Page 26",
365+
"Page 27",
366+
"Page 28",
367+
"Page 29",
368+
"Page 30",
369+
"Page 31",
370+
"Page 32",
371+
"Page 33",
372+
"Page 34",
373+
"Page 35",
374+
"Page 36",
375+
"Page 37",
376+
"Page 38",
377+
"AR 2021 Book 005-3 (Komite Audit)",
378+
"Page 1",
379+
"Page 2",
380+
"Page 3",
381+
"Page 4",
382+
"Page 5",
383+
"Page 6",
384+
"Page 7",
385+
"Page 8",
386+
"Page 9",
387+
"AR 2021 Book 005-4 (Sekretaris Perusahaan)",
388+
"Page 1",
389+
"Page 2",
390+
"Page 3",
391+
"Page 4",
392+
"Page 5",
393+
"Page 6",
394+
"Page 7",
395+
"Page 8",
396+
"Page 9",
397+
"Page 10",
398+
"AR 2021 Book 005-5 (Unit Audit Internal)",
399+
"Page 1",
400+
"Page 2",
401+
"Page 3",
402+
"Page 4",
403+
"Page 5",
404+
"Page 6",
405+
"AR 2021 Book 005-6 (Sistem Pengendalian Internal)",
406+
"Page 1",
407+
"Page 2",
408+
"Page 3",
409+
"Page 4",
410+
"Page 5",
411+
"Page 6",
412+
"Page 7",
413+
"Page 8",
414+
"AR 2021 Book 005-7 (Program Saham)",
415+
"Page 1",
416+
"AR 2021 Book 005-8 ( Whistleblowing)",
417+
"Page 1",
418+
"Page 2",
419+
"Page 3",
420+
"Page 4",
421+
"Page 5",
422+
"Page 6",
423+
"Page 7",
424+
"Page 8",
425+
"Page 9",
426+
"Page 10",
427+
"Page 11",
428+
"Page 12",
429+
"Page 13",
430+
"Page 14",
431+
"Page 15",
432+
"Page 16",
433+
"Page 17",
434+
"Page 18",
435+
"Page 19",
436+
"Page 20",
437+
"Page 21",
438+
"Page 22",
439+
"Page 23",
440+
"Page 24",
441+
"Page 25",
442+
"AR 2021 Book 006 (Tanggung Jawab Sosial - CSR)",
443+
"Page 1",
444+
"Page 2",
445+
"AR 2021 Book 007-1 (LAPORAN KEUANGAN KONSOLIDASIAN)",
446+
"Page 1",
447+
"AR 2021 Book 007-2 (Isi Laporan Keuangan)",
448+
"AR 2021 Book 008 (Tanggung Jawab Atas Laporan Tahunan)",
449+
"Page 1",
450+
"Page 2"
451+
]

tests/test_reader.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -709,7 +709,7 @@ def test_issue604(caplog, strict):
709709
pdf = PdfReader(f, strict=strict)
710710
outline = pdf.outline
711711
msg = [
712-
"Unknown destination: ms_Thyroid_2_2020_071520_watermarked.pdf [0, 1]"
712+
"Unknown destination: 'ms_Thyroid_2_2020_071520_watermarked.pdf' [0, 1]"
713713
]
714714
assert normalize_warnings(caplog.text) == msg
715715

0 commit comments

Comments (0)