diff --git a/pypdf/constants.py b/pypdf/constants.py
index cf96ea6494..ee9dd12a54 100644
--- a/pypdf/constants.py
+++ b/pypdf/constants.py
@@ -245,6 +245,7 @@ class FilterTypes(StrEnum):
     CCITT_FAX_DECODE = "/CCITTFaxDecode"  # abbreviation: CCF
     DCT_DECODE = "/DCTDecode"  # abbreviation: DCT
     JPX_DECODE = "/JPXDecode"
+    BROTLI_DECODE = "/BrotliDecode"  # abbreviation: Br, PDF 2.0
     JBIG2_DECODE = "/JBIG2Decode"
 
 
@@ -258,6 +259,7 @@ class FilterTypeAbbreviations:
     RL = "/RL"
     CCF = "/CCF"
     DCT = "/DCT"
+    BR = "/Br"
 
 
 class LzwFilterParameters:
diff --git a/pypdf/filters.py b/pypdf/filters.py
index 49719a1794..484569e7ef 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -72,6 +72,11 @@
     is_null_or_none,
 )
 
+try:
+    import brotli
+except ImportError:
+    brotli = None
+
 
 def decompress(data: bytes) -> bytes:
     """
@@ -513,6 +518,68 @@ def decode(
         return data
 
 
+class BrotliDecode:
+    """
+    Decompress the given data using Brotli.
+
+    Decodes data that has been encoded using the Brotli compression algorithm.
+    Brotli is a general-purpose lossless compression algorithm that combines
+    LZ77 and Huffman coding. It typically achieves better compression ratios
+    than Flate encoding, though with slightly slower compression speeds.
+
+    See ISO 32000-2:2020, Section 7.4.11.
+
+    Args:
+        data: The input data to be decompressed.
+        decode_parms: Optional decoding parameters (currently unused).
+        **kwargs: Additional keyword arguments (currently unused).
+
+    Returns:
+        The decompressed data.
+    """
+    @staticmethod
+    def decode(
+        data: bytes,
+        decode_parms: Optional[DictionaryObject] = None,
+        **kwargs: Any,
+    ) -> bytes:
+        """
+        Decode Brotli-compressed data.
+
+        Args:
+            data: Brotli-compressed data.
+            decode_parms: A dictionary of parameter values (unused).
+
+        Returns:
+            The decompressed data.
+
+        Raises:
+            ImportError: If the 'brotli' library is not installed.
+        """
+        if brotli is None:
+            raise ImportError("Brotli library not installed. Required for BrotliDecode filter.")
+        return brotli.decompress(data)
+
+    @staticmethod
+    def encode(data: bytes, **kwargs: Any) -> bytes:
+        """
+        Encode data using Brotli compression.
+
+        Args:
+            data: The data to be compressed.
+            **kwargs: Additional keyword arguments (unused).
+
+        Returns:
+            The compressed data.
+
+        Raises:
+            ImportError: If the 'brotli' library is not installed.
+        """
+        if brotli is None:
+            raise ImportError("Brotli library not installed. Required for BrotliDecode filter.")
+        return brotli.compress(data)
+
+
 @dataclass
 class CCITTParameters:
     """§7.4.6, optional parameters for the CCITTFaxDecode filter."""
@@ -759,6 +826,8 @@ def decode_stream_data(stream: Any) -> bytes:
             data = DCTDecode.decode(data)
         elif filter_name == FT.JPX_DECODE:
             data = JPXDecode.decode(data)
+        elif filter_name == FT.BROTLI_DECODE:
+            data = BrotliDecode.decode(data)
         elif filter_name == FT.JBIG2_DECODE:
             data = JBIG2Decode.decode(data, params)
         elif filter_name == "/Crypt":
diff --git a/pyproject.toml b/pyproject.toml
index dc5c49ed75..41ee094c65 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,9 +42,11 @@ Source = "https://github.com/py-pdf/pypdf"
 crypto = ["cryptography"]
 cryptodome = ["PyCryptodome"]
 image = ["Pillow>=8.0.0"]
+brotli = ["brotli"]
 full = [
     "cryptography",
-    "Pillow>=8.0.0"
+    "Pillow>=8.0.0",
+    "brotli",
 ]
 dev = [
     "black",
diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt
index 9013253e60..8bb49e2b7b 100644
--- a/requirements/ci-3.11.txt
+++ b/requirements/ci-3.11.txt
@@ -4,6 +4,8 @@
 #
 #    pip-compile --output-file=requirements/ci-3.11.txt requirements/ci.in
 #
+brotli==1.1.0
+    # via -r requirements/ci.in
 cffi==1.17.1
     # via cryptography
 coverage[toml]==7.6.4
diff --git a/requirements/ci.in b/requirements/ci.in
index 50b58cd3d2..7bb0b4bfbe 100644
--- a/requirements/ci.in
+++ b/requirements/ci.in
@@ -13,3 +13,4 @@ pytest-cov
 typeguard
 types-Pillow
 pyyaml
+brotli
diff --git a/requirements/ci.txt b/requirements/ci.txt
index 1589d89d1b..a9b413d135 100644
--- a/requirements/ci.txt
+++ b/requirements/ci.txt
@@ -4,6 +4,8 @@
 #
 #    pip-compile requirements/ci.in
 #
+brotli==1.1.0
+    # via -r requirements/ci.in
 cffi==1.17.1
     # via cryptography
 coverage[toml]==7.6.1
diff --git a/requirements/dev.in b/requirements/dev.in
index 6b54803052..0c961d4f39 100644
--- a/requirements/dev.in
+++ b/requirements/dev.in
@@ -4,3 +4,4 @@ pre-commit
 pytest-cov
 flit
 wheel
+brotli
diff --git a/requirements/dev.txt b/requirements/dev.txt
index 2cfd589a9e..e6310d646f 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -4,6 +4,8 @@
 #
 #    pip-compile requirements/dev.in
 #
+brotli==1.1.0
+    # via -r requirements/dev.in
 build==1.2.2.post1
     # via pip-tools
 certifi==2024.8.30
diff --git a/resources/brotli-test-pdfs/minimal-brotli-compressed.pdf b/resources/brotli-test-pdfs/minimal-brotli-compressed.pdf
new file mode 100644
index 0000000000..8ec36f926c
Binary files /dev/null and b/resources/brotli-test-pdfs/minimal-brotli-compressed.pdf differ
diff --git a/resources/create_brotli_test_pdf.py b/resources/create_brotli_test_pdf.py
new file mode 100755
index 0000000000..c3509f41da
--- /dev/null
+++ b/resources/create_brotli_test_pdf.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+"""
+Create a minimal PDF with Brotli compression for testing purposes.
+
+This script generates a simple PDF file that uses Brotli compression
+for the content stream, allowing for testing of the BrotliDecode filter
+in pypdf.
+
+Note: /BrotliDecode is not a standard PDF filter. This file is specifically
+for testing PDF library support for this filter (e.g., in pypdf).
+Standard PDF viewers will likely not render this file correctly.
+"""
+
+import logging
+from pathlib import Path
+
+import brotli
+
+logging.basicConfig(level=logging.INFO, format="%(name)s: %(levelname)s: %(message)s")
+logger = logging.getLogger(__name__)
+
+content_stream = b"BT /F1 24 Tf 100 700 Td (Hello, Brotli!) Tj ET"
+compressed_content = brotli.compress(content_stream, quality=5)
+
+xref_offsets = [0] * 6
+current_offset = 0
+pdf_parts = []
+
+part = b"%PDF-1.7\n%\xc2\xa5\xc2\xb1\xc3\xab\xc3\xbf\n"  # Binary marker
+pdf_parts.append(part)
+current_offset += len(part)
+xref_offsets[1] = current_offset
+
+part = b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"
+pdf_parts.append(part)
+current_offset += len(part)
+xref_offsets[2] = current_offset
+
+part = b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n"
+pdf_parts.append(part)
+current_offset += len(part)
+xref_offsets[3] = current_offset
+
+part = (
+    b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
+    b"/Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\nendobj\n"
+)
+pdf_parts.append(part)
+current_offset += len(part)
+xref_offsets[4] = current_offset
+
+part_header = (
+    f"4 0 obj\n<< /Length {len(compressed_content)} /Filter /BrotliDecode >>\nstream\n"
+).encode("ascii")
+part_footer = b"\nendstream\nendobj\n"
+pdf_parts.append(part_header)
+pdf_parts.append(compressed_content)
+pdf_parts.append(part_footer)
+current_offset += len(part_header) + len(compressed_content) + len(part_footer)
+xref_offsets[5] = current_offset
+
+part = b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n"
+pdf_parts.append(part)
+current_offset += len(part)
+xref_table_start_offset = current_offset
+
+xref_lines = [b"xref\n0 6\n", b"0000000000 65535 f \n"]
+xref_lines.extend(
+    f"{xref_offsets[i]:010d} 00000 n \n".encode("ascii") for i in range(1, 6)
+)
+pdf_parts.extend(xref_lines)
+
+trailer = (
+    f"trailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n{xref_table_start_offset}\n%%EOF"
+).encode("ascii")
+pdf_parts.append(trailer)
+
+script_path = Path(__file__).resolve()
+output_dir = script_path.parent / "brotli-test-pdfs"
+output_path = output_dir / "minimal-brotli-compressed.pdf"
+
+output_dir.mkdir(parents=True, exist_ok=True)
+
+try:
+    with open(output_path, "wb") as f:
+        for part in pdf_parts:
+            f.write(part)
+    logger.info("Created test PDF with Brotli compression at: %s", output_path)
+except OSError:
+    logger.exception("Error writing PDF file")
diff --git a/tests/test_filters.py b/tests/test_filters.py
index 3fd47a0062..7bd5a065ab 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -1,4 +1,5 @@
 """Test the pypdf.filters module."""
+
 import os
 import shutil
 import string
@@ -7,6 +8,7 @@
 from itertools import product as cartesian_product
 from pathlib import Path
 from unittest import mock
+from unittest.mock import patch
 
 import pytest
 from PIL import Image, ImageOps
@@ -16,6 +18,7 @@
 from pypdf.filters import (
     ASCII85Decode,
     ASCIIHexDecode,
+    BrotliDecode,
     CCITParameters,
     CCITTFaxDecode,
     CCITTParameters,
@@ -34,6 +37,12 @@
 
 from . import PILContext, get_data_from_url
 from .test_encryption import HAS_AES
 from .test_images import image_similarity
 
+try:
+    import brotli  # noqa: F401
+    HAS_BROTLI = True
+except ImportError:
+    HAS_BROTLI = False
+
 filter_inputs = (
@@ -62,6 +71,52 @@ def test_flate_decode_encode(predictor, s):
     assert codec.decode(encoded, DictionaryObject({"/Predictor": predictor})) == s
 
 
+@pytest.mark.parametrize("s", filter_inputs)
+@pytest.mark.skipif(not HAS_BROTLI, reason="brotli library not installed")
+def test_brotli_decode_encode(s):
+    """BrotliDecode.decode inverts BrotliDecode.encode for every sample input."""
+    codec = BrotliDecode()
+    s_bytes = s.encode()
+    encoded = codec.encode(s_bytes)
+    assert encoded != s_bytes  # Ensure encoding actually happened
+    decoded = codec.decode(encoded)
+    assert decoded == s_bytes
+
+
+@patch("pypdf.filters.brotli", None)
+def test_brotli_missing_installation():
+    from pypdf.filters import BrotliDecode, decode_stream_data  # noqa: PLC0415
+
+    # Test direct decode call
+    codec = BrotliDecode()
+    with pytest.raises(ImportError) as exc_info_decode:
+        codec.decode(b"test data")
+    assert "Brotli library not installed" in str(exc_info_decode.value)
+
+    # Test direct encode call
+    with pytest.raises(ImportError) as exc_info_encode:
+        codec.encode(b"test data")
+    assert "Brotli library not installed" in str(exc_info_encode.value)
+
+    # Test call via decode_stream_data
+    stream = DictionaryObject()
+    stream[NameObject("/Filter")] = NameObject("/BrotliDecode")
+    stream._data = b"dummy compressed data"
+    with pytest.raises(ImportError) as exc_info_stream:
+        decode_stream_data(stream)
+    assert "Brotli library not installed" in str(exc_info_stream.value)
+
+
+@pytest.mark.skipif(not HAS_BROTLI, reason="brotli library not installed")
+def test_brotli_decode_encode_with_real_module():
+    """Round-trip a fixed byte string through the installed brotli module."""
+    s = b"Hello, Brotli!"
+    codec = BrotliDecode()
+    encoded = codec.encode(s)
+    assert encoded != s  # Ensure encoding actually happened
+    assert codec.decode(encoded) == s
+
+
 def test_flatedecode_unsupported_predictor():
     """
     FlateDecode raises PdfReadError for unsupported predictors.
@@ -383,7 +438,9 @@ def test_iss1787():
     obj = data.indirect_reference.get_object()
     obj["/DecodeParms"][NameObject("/Columns")] = NumberObject(1000)
     obj.decoded_self = None
-    with pytest.raises(expected_exception=PdfReadError, match="^Unsupported PNG filter 244$"):
+    with pytest.raises(
+        expected_exception=PdfReadError, match="^Unsupported PNG filter 244$"
+    ):
         _ = reader.pages[0].images[0]
 
 
@@ -700,7 +757,9 @@ def test_flate_decode__not_rectangular(caplog):
     decode_parms[NameObject("/Columns")] = NumberObject(4881)
     actual = FlateDecode.decode(data=data, decode_parms=decode_parms)
     actual_image = BytesIO()
-    Image.frombytes(mode="1", size=(4881, 81), data=actual).save(actual_image, format="png")
+    Image.frombytes(mode="1", size=(4881, 81), data=actual).save(
+        actual_image, format="png"
+    )
 
     url = "https://github.com/user-attachments/assets/c5695850-c076-4255-ab72-7c86851a4a04"
     name = "issue3241.png"
@@ -709,6 +768,25 @@ def test_flate_decode__not_rectangular(caplog):
     assert caplog.messages == ["Image data is not rectangular. Adding padding."]
 
 
+@pytest.mark.skipif(not HAS_BROTLI, reason="brotli library not installed")
+def test_main_decode_brotli_installed():
+    """Extract text from the bundled PDF whose content stream uses /BrotliDecode."""
+    pdf_path = RESOURCE_ROOT / "brotli-test-pdfs" / "minimal-brotli-compressed.pdf"
+
+    reader = PdfReader(pdf_path)
+    page = reader.pages[0]
+
+    # This test specifically exercises the BrotliDecode path in decode_stream_data function
+    # when processing a real PDF with BrotliDecode filter
+    extracted_text = page.extract_text()
+
+    assert extracted_text.strip() == "Hello, Brotli!"
+
+
+def test_brotli_module_importability():
+    assert BrotliDecode is not None
+
+
 def test_jbig2decode__binary_errors():
     with mock.patch("pypdf.filters.JBIG2DEC_BINARY", None), \
         pytest.raises(DependencyError, match="jbig2dec binary is not available."):