Skip to content

Commit 347bdf4

Browse files
authored
Merge branch 'main' into refactor-page-copilot
2 parents 1ed2e38 + af645a4 commit 347bdf4

File tree

6 files changed

+25
-10
lines changed

6 files changed

+25
-10
lines changed

pypdf/_cmap.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import binascii
2+
from binascii import Error as BinasciiError
23
from binascii import unhexlify
34
from math import ceil
45
from typing import Any, Dict, List, Tuple, Union, cast
@@ -383,9 +384,12 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) ->
383384
map_to = ""
384385
# placeholder (see above) means empty string
385386
if lst[1] != b".":
386-
map_to = unhexlify(lst[1]).decode(
387-
"charmap" if len(lst[1]) < 4 else "utf-16-be", "surrogatepass"
388-
) # join is here as some cases where the code was split
387+
try:
388+
map_to = unhexlify(lst[1]).decode(
389+
"charmap" if len(lst[1]) < 4 else "utf-16-be", "surrogatepass"
390+
) # join is here as some cases where the code was split
391+
except BinasciiError as exception:
392+
logger_warning(f"Got invalid hex string: {exception!s} ({lst[1]!r})", __name__)
389393
map_dict[
390394
unhexlify(lst[0]).decode(
391395
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"

pypdf/_page.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1673,7 +1673,6 @@ def _extract_text(
16731673
default = "/Content"
16741674
16751675
"""
1676-
# Use the new TextExtraction class
16771676
extractor = TextExtraction(
16781677
self, # Pass the page object for font width maps
16791678
obj,

pypdf/_text_extraction/_text_extractor.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -558,3 +558,4 @@ def _get_actual_font_widths(
558558
font_widths += compute_font_width(font_width_map, char)
559559

560560
return (font_widths * font_size, space_width * font_size, font_size)
561+

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
[build-system]
2-
requires = ["flit_core >=3.9,<4"]
2+
requires = ["flit_core >=3.11,<4"]
33
build-backend = "flit_core.buildapi"
44

55
[project]
@@ -9,12 +9,12 @@ maintainers = [{ name = "stefan6419846" }, { name = "Martin Thoma", email = "inf
99
description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files"
1010
readme = "README.md"
1111
dynamic = ["version"]
12-
license = { file = "LICENSE" }
12+
license = "BSD-3-Clause"
13+
license-files = ["LICENSE"]
1314
requires-python = ">=3.8"
1415
classifiers = [
1516
"Development Status :: 5 - Production/Stable",
1617
"Intended Audience :: Developers",
17-
"License :: OSI Approved :: BSD License",
1818
"Programming Language :: Python :: 3",
1919
"Programming Language :: Python :: 3 :: Only",
2020
"Programming Language :: Python :: 3.8",

requirements/dev.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,9 +24,9 @@ exceptiongroup==1.2.2
2424
# via pytest
2525
filelock==3.16.1
2626
# via virtualenv
27-
flit==3.9.0
27+
flit==3.11.0
2828
# via -r requirements/dev.in
29-
flit-core==3.9.0
29+
flit-core==3.11.0
3030
# via flit
3131
identify==2.6.1
3232
# via pre-commit

tests/test_cmap.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import pytest
66

77
from pypdf import PdfReader, PdfWriter
8-
from pypdf._cmap import build_char_map, get_encoding
8+
from pypdf._cmap import build_char_map, get_encoding, parse_bfchar
99
from pypdf._codecs import charset_encoding
1010
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, NameObject, NullObject
1111

@@ -327,3 +327,14 @@ def test_get_encoding__encoding_value_is_none():
327327
dict(zip(range(256), charset_encoding["/StandardEncoding"])),
328328
{}
329329
)
330+
331+
332+
def test_parse_bfchar(caplog):
333+
map_dict = {}
334+
int_entry = []
335+
parse_bfchar(line=b"057e 1337", map_dict=map_dict, int_entry=int_entry)
336+
parse_bfchar(line=b"056e 1f310", map_dict=map_dict, int_entry=int_entry)
337+
338+
assert map_dict == {-1: 2, "ծ": "", "վ": "ጷ"}
339+
assert int_entry == [1406, 1390]
340+
assert caplog.messages == ["Got invalid hex string: Odd-length string (b'1f310')"]

0 commit comments

Comments
 (0)