-
Notifications
You must be signed in to change notification settings - Fork 1.5k
ENH: Add support for BrotliDecode filter (PDF 2.0) #3223 #3254
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 17 commits
025226a
97fc4a4
7da38f1
5b7ac7e
50c230d
42b9efe
2d03292
9c064d3
cbbea23
3ba2235
7dcd2e9
3935ea2
b4ac3d9
666b871
48cee95
94b6485
9b8b80a
e099ead
ab1c492
dc2b4db
fd842b3
1016c29
b0bf326
f143805
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,3 +13,4 @@ pytest-cov | |
| typeguard | ||
| types-Pillow | ||
| pyyaml | ||
| brotli | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,3 +4,4 @@ pre-commit | |
| pytest-cov | ||
| flit | ||
| wheel | ||
| brotli | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,90 @@ | ||
#!/usr/bin/env python
"""
Generate a minimal single-page PDF whose content stream is Brotli-compressed.

The resulting file is a fixture for exercising the BrotliDecode filter in
pypdf: the page content stream is compressed with Brotli and declared with
``/Filter /BrotliDecode``.

Note: /BrotliDecode is not a standard PDF filter. This file is specifically
for testing PDF library support for this filter (e.g., in pypdf).
Standard PDF viewers will likely not render this file correctly.
"""

import logging
from pathlib import Path

import brotli

logging.basicConfig(level=logging.INFO, format="%(name)s: %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# Page content: draw one line of text, then compress it with Brotli.
content_stream = b"BT /F1 24 Tf 100 700 Td (Hello, Brotli!) Tj ET"
compressed_content = brotli.compress(content_stream, quality=5)

# File header followed by a comment line of non-ASCII bytes (binary marker).
pdf_header = b"%PDF-1.7\n%\xc2\xa5\xc2\xb1\xc3\xab\xc3\xbf\n"

# Object 4 is the compressed content stream; /Length is the compressed size.
stream_object = b"".join(
    (
        f"4 0 obj\n<< /Length {len(compressed_content)} /Filter /BrotliDecode >>\nstream\n".encode("ascii"),
        compressed_content,
        b"\nendstream\nendobj\n",
    )
)

# Indirect objects 1..5 in file order: catalog, page tree, page, stream, font.
indirect_objects = [
    b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
    b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
    (
        b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
        b"/Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\nendobj\n"
    ),
    stream_object,
    b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n",
]

# Lay the objects out one after another, recording the byte offset at which
# each one starts. Index 0 is the placeholder for the free-list head entry.
pdf_parts = [pdf_header]
xref_offsets = [0]
current_offset = len(pdf_header)
for indirect_object in indirect_objects:
    xref_offsets.append(current_offset)
    pdf_parts.append(indirect_object)
    current_offset += len(indirect_object)

# The cross-reference table begins right after the last object.
xref_table_start_offset = current_offset

pdf_parts.append(b"xref\n0 6\n")
pdf_parts.append(b"0000000000 65535 f \n")
for object_offset in xref_offsets[1:]:
    pdf_parts.append(f"{object_offset:010d} 00000 n \n".encode("ascii"))

pdf_parts.append(
    f"trailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n{xref_table_start_offset}\n%%EOF".encode("ascii")
)

# The PDF is written next to this script, inside "brotli-test-pdfs".
script_path = Path(__file__).resolve()
output_dir = script_path.parent / "brotli-test-pdfs"
output_path = output_dir / "minimal-brotli-compressed.pdf"

output_dir.mkdir(parents=True, exist_ok=True)

try:
    with output_path.open("wb") as pdf_file:
        pdf_file.write(b"".join(pdf_parts))
    logger.info(f"Created test PDF with Brotli compression at: {output_path}")
except OSError:
    logger.exception("Error writing PDF file")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,20 +1,29 @@ | ||
| """Test the pypdf.filters module.""" | ||
|
|
||
| import builtins | ||
| import importlib.util | ||
| import os | ||
| import shutil | ||
| import string | ||
| import subprocess | ||
| from io import BytesIO | ||
| from itertools import product as cartesian_product | ||
| from pathlib import Path | ||
| from typing import TYPE_CHECKING | ||
| from unittest.mock import patch | ||
|
|
||
| import pytest | ||
|
|
||
| if TYPE_CHECKING: | ||
| import types | ||
| from PIL import Image, ImageOps | ||
|
|
||
| from pypdf import PdfReader | ||
| from pypdf.errors import DeprecationError, PdfReadError | ||
| from pypdf.filters import ( | ||
| ASCII85Decode, | ||
| ASCIIHexDecode, | ||
| BrotliDecode, | ||
| CCITParameters, | ||
| CCITTFaxDecode, | ||
| CCITTParameters, | ||
|
|
@@ -52,6 +61,50 @@ def test_flate_decode_encode(predictor, s): | |
| assert codec.decode(encoded, DictionaryObject({"/Predictor": predictor})) == s | ||
|
|
||
|
|
||
@pytest.mark.parametrize("s", filter_inputs)
def test_brotli_decode_encode(s):
    """Data survives a round trip through BrotliDecode.encode() and decode()."""
    raw = s.encode()
    brotli_filter = BrotliDecode()

    compressed = brotli_filter.encode(raw)
    # The encoded form must differ from the input, otherwise nothing was encoded.
    assert compressed != raw

    assert brotli_filter.decode(compressed) == raw
|
|
||
ash01ish marked this conversation as resolved.
Show resolved
Hide resolved
stefan6419846 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
@patch("pypdf.filters.brotli", None)
def test_brotli_missing_installation():
    """BrotliDecode signals a missing brotli package with an ImportError."""
    from pypdf.filters import BrotliDecode, decode_stream_data

    expected_message = "Brotli library not installed"
    brotli_filter = BrotliDecode()

    # Direct calls on the filter: decode() first, then encode().
    for operation in (brotli_filter.decode, brotli_filter.encode):
        with pytest.raises(ImportError) as direct_error:
            operation(b"test data")
        assert expected_message in str(direct_error.value)

    # Same expectation when the filter is reached through decode_stream_data().
    brotli_stream = DictionaryObject()
    brotli_stream[NameObject("/Filter")] = NameObject("/BrotliDecode")
    brotli_stream._data = b"dummy compressed data"
    with pytest.raises(ImportError) as stream_error:
        decode_stream_data(brotli_stream)
    assert expected_message in str(stream_error.value)
|
|
||
|
|
||
@pytest.mark.skipif(importlib.util.find_spec("brotli") is None, reason="brotli library not installed")
def test_brotli_import_handling_available():
    """pypdf.filters holds a non-None ``brotli`` attribute when the package is installed."""
    # Only meaningful with brotli installed; the skipif above guards the other case.
    import pypdf.filters as filters_module

    assert filters_module.brotli is not None, "brotli should be imported when available"
|
|
||
|
|
||
| def test_flatedecode_unsupported_predictor(): | ||
| """ | ||
| FlateDecode raises PdfReadError for unsupported predictors. | ||
|
|
@@ -226,9 +279,7 @@ def get_object(self, reference) -> NumberObject: | |
|
|
||
| def test_ccitt_fax_decode(): | ||
| data = b"" | ||
| parameters = DictionaryObject( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As mentioned previously, please revert all unrelated changes. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes. I have staged a commit which will revert the changes - it seems I ran ruff fix and everything got reformatted. Apologies - this will be fixed. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This still seems to be unresolved? |
||
| {"/K": NumberObject(-1), "/Columns": NumberObject(17)} | ||
| ) | ||
| parameters = DictionaryObject({"/K": NumberObject(-1), "/Columns": NumberObject(17)}) | ||
|
|
||
| # This was just the result pypdf 1.27.9 returned. | ||
| # It would be awesome if we could check if that is actually correct. | ||
|
|
@@ -689,3 +740,34 @@ def test_flate_decode__not_rectangular(caplog): | |
| expected = get_data_from_url(url, name=name) | ||
| assert actual_image.getvalue() == expected | ||
| assert caplog.messages == ["Image data is not rectangular. Adding padding."] | ||
|
|
||
ash01ish marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
stefan6419846 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
@pytest.mark.skipif(importlib.util.find_spec("brotli") is None, reason="brotli library not installed")
def test_main_decode_brotli_installed():
    """
    Text extraction works on a real PDF whose content stream uses /BrotliDecode.

    The skip condition is expressed as a ``skipif`` marker, matching the other
    Brotli test in this module that needs the package, instead of calling
    ``pytest.skip`` inside the test body.
    """
    pdf_path = RESOURCE_ROOT / "brotli-test-pdfs" / "minimal-brotli-compressed.pdf"

    reader = PdfReader(pdf_path)
    first_page_text = reader.pages[0].extract_text()

    assert first_page_text.strip() == "Hello, Brotli!"
|
|
||
ash01ish marked this conversation as resolved.
Show resolved
Hide resolved
|
||
def test_brotli_import_error_with_patch():
    """
    Reloading pypdf.filters while ``import brotli`` fails leaves ``brotli`` as None.

    ``builtins.__import__`` is patched so that importing ``brotli`` raises
    ImportError, then ``pypdf.filters`` is reloaded to re-run its import
    handling. The module is always reloaded again afterwards, with the real
    import machinery, so a failing assertion cannot leave the brotli-less
    module state behind for other tests.
    """
    original_import = builtins.__import__

    def mock_import(name, globals=None, locals=None, fromlist=(), level=0) -> "types.ModuleType":
        # Fail only for brotli; every other import goes to the real implementation.
        if name == "brotli":
            raise ImportError("Simulated brotli import error")
        return original_import(name, globals, locals, fromlist, level)

    filters_module = importlib.import_module("pypdf.filters")
    try:
        with patch("builtins.__import__", side_effect=mock_import):
            importlib.reload(filters_module)

            # The filter class must still be defined without the optional dependency.
            assert filters_module.BrotliDecode is not None
            assert filters_module.brotli is None
    finally:
        # The patch is already undone here, so this restores the normal module state.
        importlib.reload(filters_module)
Uh oh!
There was an error while loading. Please reload this page.