Merge branch 'main' into refactor-page-copilot

MartinThoma · web-flow · commit 96089009be47 · 2025-07-04T21:58:36.000+02:00
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -1495,7 +1495,7 @@ def _write_increment(self, stream: StreamType) -> None:
             )
         )
         xr.write_to_stream(stream)
-        stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof
+        stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof
 
     def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
         object_positions = []
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -385,7 +385,16 @@ def decode(
             length = data[index]
             index += 1
             if length == 128:
-                if index < len(data):
+                data_length = len(data)
+                if index < data_length:
+                    # First check whether this is an inner stream of a multi-encoded
+                    # stream with a faulty trailing newline that can still be decoded.
+                    # If so, ignore the last byte and log a warning.
+                    if (index == data_length - 1) and (data[index : index+1] == b"\n"):
+                        logger_warning(
+                            "Found trailing newline in stream data, check if output is OK", __name__
+                        )
+                        break
                     raise PdfStreamError("Early EOD in RunLengthDecode")
                 break
             if length < 128:
diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -22,6 +22,7 @@
     CCITTParameters,
     FlateDecode,
     JBIG2Decode,
+    RunLengthDecode,
 )
 from pypdf.generic import (
     ArrayObject,
@@ -816,3 +817,38 @@ def test_flate_decode_stream_with_faulty_tail_bytes():
     reader = PdfReader(BytesIO(data))
     obj = reader.get_object(IndirectObject(182, 0, reader))
     assert cast(StreamObject, obj).get_data() == expected
+
+
+@pytest.mark.enable_socket
+def test_rle_decode_with_faulty_tail_byte_in_multi_encoded_stream(caplog):
+    """
+    Test for #3355
+
+    The test ensures that the inner RLE-encoded stream can be decoded
+    even though it contains an extra, faulty newline byte at the end,
+    which can be ignored during decoding.
+    """
+    data = get_data_from_url(
+        url="https://github.com/user-attachments/files/21038398/test_data_rle.txt",
+        name="multi_decoding_example_with_faulty_tail_byte.pdf"
+    )
+    reader = PdfReader(BytesIO(data))
+    obj = reader.get_object(IndirectObject(60, 0, reader))
+    cast(StreamObject, obj).get_data()
+    assert "Found trailing newline in stream data, check if output is OK" in caplog.messages
+
+
+@pytest.mark.enable_socket
+def test_rle_decode_exception_with_corrupted_stream():
+    """
+    Additional test for #3355
+
+    This test must raise the early EOD exception during RLE decoding; it keeps
+    that error path covered so the code coverage check in the PR pipeline passes.
+    """
+    data = get_data_from_url(
+        url="https://github.com/user-attachments/files/21052626/rle_stream_with_error.txt",
+        name="rle_stream_with_error.txt"
+    )
+    with pytest.raises(PdfStreamError, match="Early EOD in RunLengthDecode"):
+        RunLengthDecode.decode(data)
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
@@ -414,3 +414,45 @@ def test_rotated_layout_mode(caplog):
     assert not caplog.records, "No warnings should be issued"
     assert text, "Text matching the page rotation should be extracted"
     assert re.search(r"\r?\n +69\r?\n +UNCLASSIFIED$", text), "Contents should be in expected layout"
+
+
+@pytest.mark.enable_socket
+@pytest.mark.filterwarnings("ignore::pypdf.errors.PdfReadWarning")
+def test_extract_text__none_objects():
+    url = "https://github.com/user-attachments/files/18381726/tika-957721.pdf"
+    name = "tika-957721.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+
+    reader.pages[0].extract_text()
+    reader.pages[8].extract_text()
+
+
+@pytest.mark.enable_socket
+def test_extract_text__with_visitor_text():
+    def visitor_text(*args, **kwargs):  # noqa: ANN002, ANN003, ANN202
+        pass
+
+    url = "https://github.com/user-attachments/files/18381718/tika-952016.pdf"
+    name = "tika-952016.pdf"
+    stream = BytesIO(get_data_from_url(url, name=name))
+    reader = PdfReader(stream)
+    page = reader.pages[0]
+    page.extract_text(visitor_text=visitor_text)
+
+    reader = PdfReader(BytesIO(get_data_from_url(name="TextAttack_paper.pdf")))
+    page = reader.pages[0]
+    page.extract_text(visitor_text=visitor_text)
+
+
+@pytest.mark.enable_socket
+def test_extract_text__restore_cm_stack_pop_error():
+    url = "https://github.com/user-attachments/files/18381737/tika-966635.pdf"
+    name = "tika-966635.pdf"
+    stream = BytesIO(get_data_from_url(url, name=name))
+    reader = PdfReader(stream)
+    page = reader.pages[10]
+
+    # There is an earlier error that is already ignored ("pop from empty list"),
+    # so check for the message explicitly here.
+    with pytest.raises(IndexError, match="list index out of range"):
+        page.extract_text()
diff --git a/tests/test_writer.py b/tests/test_writer.py
@@ -2769,3 +2769,16 @@ def test_insert_filtered_annotations__annotations_are_no_list(caplog):
             "'/Normal'} of type DictionaryObject."
         )
     ]
+
+
+def test_unterminated_object__with_incremental_writer():
+    """Test for #3118"""
+    reader = PdfReader(RESOURCE_ROOT / "bytes.pdf")
+    writer = PdfWriter(reader, incremental=True)
+
+    writer.add_blank_page(72, 72)
+
+    fi = BytesIO()
+    writer.write(fi)
+    b = fi.getvalue()
+    assert b[-39:] == b"\nendstream\nendobj\nstartxref\n1240\n%%EOF\n"

Original file line number	Diff line number	Diff line change
`@@ -1495,7 +1495,7 @@ def _write_increment(self, stream: StreamType) -> None:`
`1495`	`1495`	`)`
`1496`	`1496`	`)`
`1497`	`1497`	`xr.write_to_stream(stream)`
`1498`		`- stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode()) # eof`
	`1498`	`+ stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode()) # eof`
`1499`	`1499`
`1500`	`1500`	`def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:`
`1501`	`1501`	`object_positions = []`