Skip to content

Commit 5a9a0da

Browse files
BUG: Avoid sharing array-based content streams between pages (#3681)
Otherwise, applying a transformation to one of the pages renders all pages referencing the shared objects invalid, because `PageObject.replace_contents` sets them to a `NullObject` without being able to reliably check for further usages. Closes #3680.
1 parent a3451e8 commit 5a9a0da

File tree

4 files changed

+64
-3
lines changed

4 files changed

+64
-3
lines changed

pypdf/generic/_data_structures.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,13 @@ def clone(
136136
force_duplicate,
137137
)
138138
arr.append(dup.indirect_reference)
139+
elif isinstance(data, IndirectObject) and isinstance(resolved := data.get_object(), StreamObject):
140+
dup = data._reference_clone(
141+
resolved.clone(pdf_dest, force_duplicate=True, ignore_fields=ignore_fields),
142+
pdf_dest,
143+
force_duplicate,
144+
)
145+
arr.append(dup.indirect_reference)
139146
elif hasattr(data, "clone"):
140147
arr.append(data.clone(pdf_dest, force_duplicate, ignore_fields))
141148
else:
705 Bytes
Binary file not shown.

tests/generic/test_data_structures.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,16 @@
1010

1111
from pypdf import PdfReader, PdfWriter
1212
from pypdf.errors import LimitReachedError
13-
from pypdf.generic import DictionaryObject, NameObject, RectangleObject, TreeObject
13+
from pypdf.generic import (
14+
ArrayObject,
15+
ContentStream,
16+
DictionaryObject,
17+
NameObject,
18+
NullObject,
19+
RectangleObject,
20+
StreamObject,
21+
TreeObject,
22+
)
1423
from tests import RESOURCE_ROOT, get_data_from_url
1524

1625
try:
@@ -67,6 +76,50 @@ def test_array_object__clone_same_object_multiple_times(caplog):
6776
assert caplog.messages == []
6877

6978

79+
def test_array_object__clone_same_stream_multiple_times():
80+
writer = PdfWriter()
81+
82+
# Unique streams.
83+
stream1 = StreamObject()
84+
stream1.set_data(b"Hello World!")
85+
stream2 = StreamObject()
86+
stream2.set_data(b"Lorem ipsum!")
87+
88+
# Shared streams.
89+
shared_streams = [StreamObject() for _ in range(3)]
90+
[shared_stream.set_data(f"Shared stream {index}".encode()) for index, shared_stream in enumerate(shared_streams)]
91+
92+
# Add to writer.
93+
writer._add_object(stream1)
94+
writer._add_object(stream2)
95+
shared_references = [writer._add_object(shared_stream) for shared_stream in shared_streams]
96+
97+
# Arrays.
98+
array1 = ArrayObject([stream1.indirect_reference, *shared_references])
99+
array2 = ArrayObject([stream2.indirect_reference, *shared_references])
100+
101+
# Cloned.
102+
cloned1 = array1.clone(pdf_dest=writer)
103+
cloned2 = array2.clone(pdf_dest=writer)
104+
105+
# Nullify one shared object.
106+
writer._replace_object(shared_references[1].indirect_reference, NullObject())
107+
108+
# The first entry is always different. The remaining shared entries should be dedicated copies.
109+
assert cloned1[1:] != cloned2[1:]
110+
111+
assert ContentStream(stream=array1, pdf=None).get_data() == b"Hello World!\nShared stream 0\nShared stream 2\n"
112+
assert ContentStream(stream=array2, pdf=None).get_data() == b"Lorem ipsum!\nShared stream 0\nShared stream 2\n"
113+
assert (
114+
ContentStream(stream=cloned1, pdf=None).get_data() ==
115+
b"Hello World!\nShared stream 0\nShared stream 1\nShared stream 2\n"
116+
)
117+
assert (
118+
ContentStream(stream=cloned2, pdf=None).get_data() ==
119+
b"Lorem ipsum!\nShared stream 0\nShared stream 1\nShared stream 2\n"
120+
)
121+
122+
70123
@pytest.mark.enable_socket
71124
def test_dictionary_object__read_from_stream__limit():
72125
name = "read_from_stream__length_2gb.pdf"

tests/test_page.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1524,8 +1524,9 @@ def test_replace_contents__null_object_cloning_error():
15241524
new_page = writer.add_page(page)
15251525
new_page.scale_by(1)
15261526

1527-
assert isinstance(writer.get_object(50)["/Contents"], ContentStream)
1528-
assert isinstance(writer.get_object(51), NullObject)
1527+
page4_idnum = writer.pages[3].indirect_reference.idnum
1528+
assert isinstance(writer.get_object(page4_idnum)["/Contents"], ContentStream)
1529+
assert isinstance(writer.get_object(page4_idnum + 1), NullObject)
15291530

15301531
data = BytesIO()
15311532
writer.write(data)

0 commit comments

Comments
 (0)