Skip to content

Commit 2cb0d76

Browse files
authored
Merge pull request #559 from funstory-ai/dev
Initial support for Type 3 fonts & improved PDF compatibility
2 parents 258a92b + eb6de30 commit 2cb0d76

File tree

9 files changed

+73
-21
lines changed

9 files changed

+73
-21
lines changed

babeldoc/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.22"
1+
__version__ = "0.5.23"

babeldoc/const.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import threading
77
from pathlib import Path
88

9-
__version__ = "0.5.22"
9+
__version__ = "0.5.23"
1010

1111
CACHE_FOLDER = Path.home() / ".cache" / "babeldoc"
1212

babeldoc/format/pdf/babelpdf/cidfont.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,12 @@ def get_descendant_fonts(doc, xref):
4343
def get_glyph_bbox(face, g):
4444
try:
4545
face.load_glyph(g, freetype.FT_LOAD_NO_SCALE)
46-
cbox = face.glyph.outline.get_bbox()
47-
return cbox.xMin, cbox.yMin, cbox.xMax, cbox.yMax
46+
outline = face.glyph.outline
47+
if outline.contours:
48+
cbox = outline.get_bbox()
49+
return cbox.xMin, cbox.yMin, cbox.xMax, cbox.yMax
50+
else:
51+
return 0, 0, 0, 0
4852
except Exception:
4953
return 0, 0, 0, 0
5054

babeldoc/format/pdf/babelpdf/type3.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import io
2+
import re
3+
4+
import pymupdf
5+
6+
7+
def merge_bbox(bbox_list, factor=1):
    """Union rectangles and return the result rescaled with the y axis flipped.

    Args:
        bbox_list: Sequence of rectangles. Each must unpack to
            ``(x0, y0, x1, y1)``; when there is more than one they are
            combined with ``|`` (presumably ``pymupdf.Rect`` objects, as
            collected from ``get_drawings()`` by ``get_type3_bbox`` --
            confirm for any other caller).
        factor: Divisor applied to every coordinate of the union.

    Returns:
        ``(x0, -y1, x1, -y0)`` built from the scaled union, or ``None`` when
        ``bbox_list`` is empty.
    """
    if not bbox_list:
        return None
    # Use the non-mutating ``|`` union rather than ``include_rect`` so the
    # caller's first rectangle is not enlarged in place.
    merged = bbox_list[0]
    for bbox in bbox_list[1:]:
        merged = merged | bbox
    x0, y0, x1, y1 = (v / factor for v in tuple(merged))
    # Negate and swap the y extents to flip the vertical axis.
    return x0, -y1, x1, -y0
14+
15+
16+
def get_type3_bbox(doc, obj):
    """Measure a bounding box for every character code of a Type 3 font.

    Each code in ``FirstChar..LastChar`` is drawn on a temporary scratch page,
    the page is exported as SVG with text converted to paths, and the drawing
    rectangles found in that SVG are merged into one box per code.

    Args:
        doc: Open ``pymupdf`` document that owns the font object. A scratch
            page is appended to it while measuring and removed again before
            returning, including when an error occurs.
        obj: xref number of the font object.

    Returns:
        A list of 256 ``(x0, y0, x1, y1)`` tuples indexed by character code.
        Codes that were not measured or produced no drawings keep
        ``(0, 0, 0, 0)``.

    Raises:
        ValueError: If ``FirstChar`` or ``LastChar`` cannot be parsed as an
            integer.
    """
    bbox_list = [(0, 0, 0, 0)] * 256
    first = int(doc.xref_get_key(obj, "FirstChar")[1])
    last = int(doc.xref_get_key(obj, "LastChar")[1])
    factor_text = doc.xref_get_key(obj, "FontMatrix")[1]
    factor = 1
    if factor_m := re.search(r"(\d+)?\.\d+", factor_text):
        parsed = float(factor_m.group(0))
        # A literal such as "0.0" would make merge_bbox divide by zero;
        # keep the neutral factor in that case.
        if parsed:
            factor = parsed
    page = doc.new_page(width=10, height=10)
    try:
        doc.xref_set_key(page.xref, "Resources", "<<>>")
        doc.xref_set_key(page.xref, "Resources/Font", f"<</T0 {obj} 0 R>>")
        text = doc.get_new_xref()
        doc.update_object(text, "<<>>")
        # The result table has 256 slots; clamp the range so a malformed
        # FirstChar/LastChar cannot index outside of it.
        for x in range(max(first, 0), min(last, 255) + 1):
            # Content stream: shift the origin to the top of the 10x10 page,
            # select the font at size 1 and show the single code ``x``.
            doc.update_stream(text, b"1 0 0 1 0 10 cm BT /T0 1 Tf <%02X> Tj ET" % x)
            doc.xref_set_key(page.xref, "Contents", f"{text} 0 R")
            char_data = page.get_svg_image(text_as_path=True)
            char_doc = pymupdf.Document(stream=io.BytesIO(char_data.encode("U8")))
            try:
                char_bbox = [
                    item["rect"]
                    for element in char_doc
                    for item in element.get_drawings()
                ]
            finally:
                # Release the per-glyph SVG document promptly.
                char_doc.close()
            if char_bbox_merged := merge_bbox(char_bbox, factor):
                bbox_list[x] = char_bbox_merged
    finally:
        # Always remove the scratch page so a failure does not leave it in
        # the caller's document.
        doc.delete_page(-1)
    return bbox_list

babeldoc/format/pdf/document_il/backend/pdf_creater.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -485,10 +485,13 @@ def reproduce_cmap(doc):
485485
assert doc
486486
font_set = set()
487487
for page in doc:
488-
font_list = page.get_fonts()
489-
for font in font_list:
490-
if font[1] == "ttf" and font[3] in FONT_NAMES and ".ttf" in font[4]:
491-
font_set.add(font)
488+
try:
489+
font_list = page.get_fonts()
490+
for font in font_list:
491+
if font[1] == "ttf" and font[3] in FONT_NAMES and ".ttf" in font[4]:
492+
font_set.add(font)
493+
except Exception as e:
494+
logger.error(f"Error in getting page fonts: {e}")
492495
for font in font_set:
493496
reproduce_one_font(doc, font[0])
494497
return doc

babeldoc/format/pdf/document_il/frontend/il_creater.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515
import babeldoc.pdfminer.pdfinterp
1616
from babeldoc.format.pdf.babelpdf.base14 import get_base14_bbox
1717
from babeldoc.format.pdf.babelpdf.cidfont import get_cidfont_bbox
18+
from babeldoc.format.pdf.babelpdf.cidfont import get_glyph_bbox
1819
from babeldoc.format.pdf.babelpdf.encoding import WinAnsiEncoding
1920
from babeldoc.format.pdf.babelpdf.encoding import get_type1_encoding
21+
from babeldoc.format.pdf.babelpdf.type3 import get_type3_bbox
2022
from babeldoc.format.pdf.babelpdf.utils import guarded_bbox
2123
from babeldoc.format.pdf.document_il import il_version_1
2224
from babeldoc.format.pdf.document_il.utils import zstd_helper
@@ -111,23 +113,17 @@ def indirect(obj):
111113
return int(obj[1].split(" ")[0])
112114

113115

114-
def get_glyph_cbox(face, g):
115-
face.load_glyph(g, freetype.FT_LOAD_NO_SCALE)
116-
cbox = face.glyph.outline.get_bbox()
117-
return cbox.xMin, cbox.yMin, cbox.xMax, cbox.yMax
118-
119-
120116
def get_char_cbox(face, idx):
121117
g = face.get_char_index(idx)
122-
return get_glyph_cbox(face, g)
118+
return get_glyph_bbox(face, g)
123119

124120

125121
def get_name_cbox(face, name):
126122
if name:
127123
if isinstance(name, str):
128124
name = name.encode("utf-8")
129125
g = face.get_name_index(name)
130-
return get_glyph_cbox(face, g)
126+
return get_glyph_bbox(face, g)
131127
return (0, 0, 0, 0)
132128

133129

@@ -798,6 +794,8 @@ def parse_font_xobj_id(self, xobj_id: int):
798794
bbox_list = get_base14_bbox(obj_val[1:])
799795
if cid_bbox := get_cidfont_bbox(self.mupdf, xobj_id):
800796
bbox_list = cid_bbox
797+
if self.mupdf.xref_get_key(xobj_id, "Subtype")[1] == "/Type3":
798+
bbox_list = get_type3_bbox(self.mupdf, xobj_id)
801799
return bbox_list, cmap
802800

803801
def create_graphic_state(

babeldoc/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from babeldoc.translator.translator import set_translate_rate_limiter
2727

2828
logger = logging.getLogger(__name__)
29-
__version__ = "0.5.22"
29+
__version__ = "0.5.23"
3030

3131

3232
def create_parser():

babeldoc/translator/translator.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,15 @@ def update_token_count(self, response):
344344
self.prompt_token_count.inc(response.usage.prompt_tokens)
345345
if response.usage and response.usage.completion_tokens:
346346
self.completion_token_count.inc(response.usage.completion_tokens)
347-
if response.usage and (
348-
hit_count := getattr(response.usage, "prompt_cache_hit_tokens", 0)
347+
# Support both response.usage.prompt_cache_hit_tokens and response.prompt_tokens_details.cached_tokens
348+
hit_count = 0
349+
if response.usage and hasattr(response.usage, "prompt_cache_hit_tokens"):
350+
hit_count = getattr(response.usage, "prompt_cache_hit_tokens", 0)
351+
if hasattr(response, "prompt_tokens_details") and getattr(
352+
response.prompt_tokens_details, "cached_tokens", 0
349353
):
354+
hit_count += getattr(response.prompt_tokens_details, "cached_tokens", 0)
355+
if hit_count:
350356
self.cache_hit_prompt_token_count.inc(hit_count)
351357
except Exception as e:
352358
logger.exception("Error updating token count")

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "BabelDOC"
3-
version = "0.5.22"
3+
version = "0.5.23"
44
description = "Yet Another Document Translator"
55
license = "AGPL-3.0"
66
readme = "README.md"
@@ -162,7 +162,7 @@ pythonpath = [".", "src"]
162162
testpaths = ["tests"]
163163

164164
[bumpver]
165-
current_version = "0.5.22"
165+
current_version = "0.5.23"
166166
version_pattern = "MAJOR.MINOR.PATCH[.PYTAGNUM]"
167167

168168
[bumpver.file_patterns]

0 commit comments

Comments
 (0)