Skip to content

Commit 9608900

Browse files
authored
Merge branch 'main' into refactor-page-copilot
2 parents 347bdf4 + 442e8d5 commit 9608900

File tree

5 files changed

+102
-2
lines changed

5 files changed

+102
-2
lines changed

pypdf/_writer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1495,7 +1495,7 @@ def _write_increment(self, stream: StreamType) -> None:
14951495
)
14961496
)
14971497
xr.write_to_stream(stream)
1498-
stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode()) # eof
1498+
stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode()) # eof
14991499

15001500
def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
15011501
object_positions = []

pypdf/filters.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,16 @@ def decode(
385385
length = data[index]
386386
index += 1
387387
if length == 128:
388-
if index < len(data):
388+
data_length = len(data)
389+
if index < data_length:
390+
# We should first check whether we have an inner stream from a multi-encoded
391+
# stream with a faulty trailing newline that we can decode properly.
392+
# We will just ignore the last byte and log a warning ...
393+
if (index == data_length - 1) and (data[index : index+1] == b"\n"):
394+
logger_warning(
395+
"Found trailing newline in stream data, check if output is OK", __name__
396+
)
397+
break
389398
raise PdfStreamError("Early EOD in RunLengthDecode")
390399
break
391400
if length < 128:

tests/test_filters.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
CCITTParameters,
2323
FlateDecode,
2424
JBIG2Decode,
25+
RunLengthDecode,
2526
)
2627
from pypdf.generic import (
2728
ArrayObject,
@@ -816,3 +817,38 @@ def test_flate_decode_stream_with_faulty_tail_bytes():
816817
reader = PdfReader(BytesIO(data))
817818
obj = reader.get_object(IndirectObject(182, 0, reader))
818819
assert cast(StreamObject, obj).get_data() == expected
820+
821+
822+
@pytest.mark.enable_socket
823+
def test_rle_decode_with_faulty_tail_byte_in_multi_encoded_stream(caplog):
824+
"""
825+
Test for #3355
826+
827+
The test ensures that the inner RLE encoded stream can be decoded,
828+
because this stream contains an extra faulty newline byte at the
829+
end that can be ignored during decoding.
830+
"""
831+
data = get_data_from_url(
832+
url="https://github.com/user-attachments/files/21038398/test_data_rle.txt",
833+
name="multi_decoding_example_with_faulty_tail_byte.pdf"
834+
)
835+
reader = PdfReader(BytesIO(data))
836+
obj = reader.get_object(IndirectObject(60, 0, reader))
837+
cast(StreamObject, obj).get_data()
838+
assert "Found trailing newline in stream data, check if output is OK" in caplog.messages
839+
840+
841+
@pytest.mark.enable_socket
842+
def test_rle_decode_exception_with_corrupted_stream():
843+
"""
844+
Additional test for #3355
845+
846+
This test must raise the EOD exception during RLE decoding and ensures
847+
that we do not fail during code coverage analyses in the git PR pipeline.
848+
"""
849+
data = get_data_from_url(
850+
url="https://github.com/user-attachments/files/21052626/rle_stream_with_error.txt",
851+
name="rle_stream_with_error.txt"
852+
)
853+
with pytest.raises(PdfStreamError, match="Early EOD in RunLengthDecode"):
854+
RunLengthDecode.decode(data)

tests/test_text_extraction.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,3 +414,45 @@ def test_rotated_layout_mode(caplog):
414414
assert not caplog.records, "No warnings should be issued"
415415
assert text, "Text matching the page rotation should be extracted"
416416
assert re.search(r"\r?\n +69\r?\n +UNCLASSIFIED$", text), "Contents should be in expected layout"
417+
418+
419+
@pytest.mark.enable_socket
420+
@pytest.mark.filterwarnings("ignore::pypdf.errors.PdfReadWarning")
421+
def test_extract_text__none_objects():
422+
url = "https://github.com/user-attachments/files/18381726/tika-957721.pdf"
423+
name = "tika-957721.pdf"
424+
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
425+
426+
reader.pages[0].extract_text()
427+
reader.pages[8].extract_text()
428+
429+
430+
@pytest.mark.enable_socket
431+
def test_extract_text__with_visitor_text():
432+
def visitor_text(*args, **kwargs): # noqa: ANN002, ANN003, ANN202
433+
pass
434+
435+
url = "https://github.com/user-attachments/files/18381718/tika-952016.pdf"
436+
name = "tika-952016.pdf"
437+
stream = BytesIO(get_data_from_url(url, name=name))
438+
reader = PdfReader(stream)
439+
page = reader.pages[0]
440+
page.extract_text(visitor_text=visitor_text)
441+
442+
reader = PdfReader(BytesIO(get_data_from_url(name="TextAttack_paper.pdf")))
443+
page = reader.pages[0]
444+
page.extract_text(visitor_text=visitor_text)
445+
446+
447+
@pytest.mark.enable_socket
448+
def test_extract_text__restore_cm_stack_pop_error():
449+
url = "https://github.com/user-attachments/files/18381737/tika-966635.pdf"
450+
name = "tika-966635.pdf"
451+
stream = BytesIO(get_data_from_url(url, name=name))
452+
reader = PdfReader(stream)
453+
page = reader.pages[10]
454+
455+
# There is a previous error we already omit ("pop from empty list"), thus
456+
# check for the message explicitly here.
457+
with pytest.raises(IndexError, match="list index out of range"):
458+
page.extract_text()

tests/test_writer.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2769,3 +2769,16 @@ def test_insert_filtered_annotations__annotations_are_no_list(caplog):
27692769
"'/Normal'} of type DictionaryObject."
27702770
)
27712771
]
2772+
2773+
2774+
def test_unterminated_object__with_incremental_writer():
2775+
"""Test for #3118"""
2776+
reader = PdfReader(RESOURCE_ROOT / "bytes.pdf")
2777+
writer = PdfWriter(reader, incremental=True)
2778+
2779+
writer.add_blank_page(72, 72)
2780+
2781+
fi = BytesIO()
2782+
writer.write(fi)
2783+
b = fi.getvalue()
2784+
assert b[-39:] == b"\nendstream\nendobj\nstartxref\n1240\n%%EOF\n"

0 commit comments

Comments
 (0)