Skip to content

Commit 1b8b05f

Browse files
authored
Merge pull request #521 from funstory-ai/dev
feat: enhance valid character and token statistics tracking
2 parents c8bdc74 + a2bfdec commit 1b8b05f

File tree

8 files changed

+140
-9
lines changed

8 files changed

+140
-9
lines changed

babeldoc/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.16"
1+
__version__ = "0.5.17"

babeldoc/const.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import threading
77
from pathlib import Path
88

9-
__version__ = "0.5.16"
9+
__version__ = "0.5.17"
1010

1111
CACHE_FOLDER = Path.home() / ".cache" / "babeldoc"
1212

babeldoc/format/pdf/babelpdf/cidfont.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,12 @@ def get_descendant_fonts(doc, xref):
4141

4242

4343
def get_glyph_bbox(face, g):
    """Return the outline bounding box of glyph ``g`` loaded without scaling.

    Args:
        face: Face object exposing ``load_glyph`` and ``glyph.outline``
            (presumably a ``freetype.Face`` -- confirm against callers).
        g: Glyph index passed to ``face.load_glyph``.

    Returns:
        A ``(xMin, yMin, xMax, yMax)`` tuple taken from the glyph outline's
        bounding box, or ``(0, 0, 0, 0)`` when the glyph cannot be loaded or
        measured.
    """
    try:
        face.load_glyph(g, freetype.FT_LOAD_NO_SCALE)
        cbox = face.glyph.outline.get_bbox()
        return cbox.xMin, cbox.yMin, cbox.xMax, cbox.yMax
    except Exception:
        # Best-effort on purpose: a single unreadable glyph must not abort
        # the caller. Leave a debug trace so the fallback is not invisible.
        import logging

        logging.getLogger(__name__).debug(
            "Failed to get bbox for glyph %s; falling back to an empty bbox",
            g,
            exc_info=True,
        )
        return 0, 0, 0, 0
4750

4851

4952
def get_face_bbox(blob):

babeldoc/format/pdf/document_il/frontend/il_creater.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@
33
import logging
44
import math
55
import re
6+
import unicodedata
67
from io import BytesIO
78
from itertools import islice
89
from typing import Literal
910

1011
import freetype
1112
import pymupdf
13+
import tiktoken
1214

1315
import babeldoc.pdfminer.pdfinterp
1416
from babeldoc.format.pdf.babelpdf.base14 import get_base14_bbox
@@ -18,6 +20,7 @@
1820
from babeldoc.format.pdf.babelpdf.utils import guarded_bbox
1921
from babeldoc.format.pdf.document_il import il_version_1
2022
from babeldoc.format.pdf.document_il.utils import zstd_helper
23+
from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper
2124
from babeldoc.format.pdf.document_il.utils.matrix_helper import decompose_ctm
2225
from babeldoc.format.pdf.document_il.utils.style_helper import BLACK
2326
from babeldoc.format.pdf.document_il.utils.style_helper import YELLOW
@@ -358,6 +361,10 @@ def __init__(self, translation_config: TranslationConfig):
358361
self.render_order = 0
359362
self.current_clip_paths: list[tuple] = []
360363
self.clip_paths_stack: list[list[tuple]] = []
364+
# For valid character collection
365+
self.font_mapper = FontMapper(translation_config)
366+
self.tokenizer = tiktoken.encoding_for_model("gpt-4o")
367+
self._page_valid_chars_buffer: list[str] | None = None
361368

362369
def transform_clip_path(
363370
self,
@@ -566,8 +573,32 @@ def on_page_start(self):
566573
self.current_clip_paths = []
567574
self.clip_paths_stack = []
568575
self.docs.page.append(self.current_page)
576+
# Prepare per-page buffer for valid characters on translated pages
577+
self._page_valid_chars_buffer = []
569578

570579
def on_page_end(self):
    """Flush the current page's valid-character buffer and advance progress.

    When the buffer holds any characters they are joined into one string.
    Its length and its token count (from ``self.tokenizer``) are handed to
    ``add_valid_counts`` on the shared cross-split-part context. A tokenizer
    failure is logged and counted as zero tokens; any other failure is
    logged and ignored. The buffer is always reset to an empty list, and
    progress is advanced by one afterwards.
    """
    try:
        collected = self._page_valid_chars_buffer
        if collected:
            text = "".join(collected)
            # Default to zero so a tokenizer failure still records characters.
            n_tokens = 0
            try:
                n_tokens = len(self.tokenizer.encode(text, disallowed_special=()))
            except Exception as e:
                logger.warning("Failed to compute token count for page: %s", e)
            shared = self.translation_config.shared_context_cross_split_part
            shared.add_valid_counts(len(text), n_tokens)
    except Exception as e:
        logger.warning("Failed to accumulate page valid stats: %s", e)
    finally:
        # Start the next page from a clean buffer regardless of the outcome.
        self._page_valid_chars_buffer = []
    self.progress.advance(1)
572603

573604
def on_page_crop_box(
@@ -848,6 +879,11 @@ def on_lt_char(self, char: LTChar):
848879
"Failed to get rotation angle for char %s",
849880
char.get_text(),
850881
)
882+
# Collect valid characters for statistics
883+
try:
884+
self._collect_valid_char(char.get_text())
885+
except Exception as e:
886+
logger.warning("Error collecting valid char: %s", e)
851887
gs = self.create_graphic_state(char.graphicstate)
852888
# Get font from current page or xobject
853889
font = None
@@ -983,6 +1019,43 @@ def on_lt_char(self, char: LTChar):
9831019
)
9841020
)
9851021

1022+
def _collect_valid_char(self, ch: str):
    """Record ``ch`` in the per-page buffer when it counts as valid text.

    Nothing happens while no page buffer exists. Otherwise:

    - Text matched by ``space_regex`` is appended as-is.
    - Empty text, or text whose first character is in Unicode category
      Cc, Cs, Co or Cn, is dropped.
    - Text containing ``"(cid:"`` is dropped.
    - Remaining text is kept only when ``self.font_mapper.has_char`` accepts
      it as a whole or, for multi-character text, accepts every character.
      An exception raised by ``has_char`` drops the text.

    The last three rules are the inverse of the criteria the original
    author attributes to formular_helper.py (lines 21-28).
    """
    buffer = self._page_valid_chars_buffer
    if buffer is None:
        return

    if space_regex.match(ch):
        buffer.append(ch)
        return

    if not ch:
        return

    try:
        category = unicodedata.category(ch[0])
    except Exception:
        category = None
    if category in {"Cc", "Cs", "Co", "Cn"}:
        return

    if "(cid:" in ch:
        return

    mapper = self.font_mapper
    try:
        # Accept the text as a whole, or character by character if it is a
        # multi-character string.
        renderable = bool(mapper.has_char(ch)) or (
            len(ch) > 1 and all(mapper.has_char(part) for part in ch)
        )
    except Exception:
        renderable = False

    if renderable:
        buffer.append(ch)
1058+
9861059
def on_lt_curve(self, curve: babeldoc.pdfminer.layout.LTCurve):
9871060
if not self.enable_graphic_element_process:
9881061
return

babeldoc/format/pdf/high_level.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -755,6 +755,23 @@ def do_translate(
755755
logger.info(
756756
f"finish translate: {original_pdf_path}, cost: {finish_time - start_time} s",
757757
)
758+
# Populate aggregate valid text statistics into result
759+
try:
760+
sc = translation_config.shared_context_cross_split_part
761+
result.total_valid_character_count = getattr(
762+
sc, "valid_char_count_total", 0
763+
)
764+
token_total = getattr(sc, "total_valid_text_token_count", None)
765+
result.total_valid_text_token_count = (
766+
token_total if isinstance(token_total, int) else 0
767+
)
768+
except Exception as e:
769+
logger.warning("Failed to populate valid text statistics: %s", e)
770+
try:
771+
result.total_valid_character_count = 0
772+
result.total_valid_text_token_count = 0
773+
except Exception:
774+
pass
758775
result.original_pdf_path = translation_config.input_file
759776
result.peak_memory_usage = peak_memory_usage
760777

babeldoc/format/pdf/translation_config.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ def __init__(self):
3232
self.auto_extracted_glossary: Glossary | None = None
3333
self.raw_extracted_terms: list[tuple[str, str]] = []
3434
self.auto_enabled_ocr_workaround = False
35+
# Statistics for valid characters/text across the whole file
36+
self.valid_char_count_total: int = 0
37+
self.total_valid_text_token_count: int = 0
3538

3639
def initialize_glossaries(self, initial_glossaries: list[Glossary] | None):
3740
with self._lock:
@@ -45,6 +48,9 @@ def initialize_glossaries(self, initial_glossaries: list[Glossary] | None):
4548
for g in self.user_glossaries:
4649
for entity in g.normalized_lookup:
4750
self.norm_terms.add(entity)
51+
# reset statistics buffer when initializing
52+
self.valid_char_count_total = 0
53+
self.total_valid_text_token_count = 0
4854

4955
def add_raw_extracted_term_pair(self, src: str, tgt: str):
5056
with self._lock:
@@ -67,7 +73,11 @@ def _generate_unique_auto_glossary_name(self) -> str:
6773
return current_name
6874

6975
def contains_term(self, term: str) -> bool:
    """Report whether ``term`` is present in ``self.norm_terms``.

    The lookup runs while holding ``self._lock``. ``term`` is compared
    as given; no normalization is applied here.

    Args:
        term: Candidate term to look up.

    Returns:
        ``True`` when ``term`` is a member of ``self.norm_terms``, otherwise
        ``False``. An unhashable ``term`` also yields ``False``.
    """
    with self._lock:
        try:
            return term in self.norm_terms
        except TypeError:
            # An unhashable value can never be a member of a hashed
            # collection. Other errors signal real bugs and are allowed to
            # surface rather than be reported as "not found".
            return False
7181

7282
def finalize_auto_extracted_glossary(self):
7383
with self._lock:
@@ -112,6 +122,16 @@ def get_glossaries_for_translation(
112122
all_glossaries.append(self.auto_extracted_glossary)
113123
return all_glossaries
114124

125+
def add_valid_counts(self, char_count: int, token_count: int):
    """Add positive character and token counts to the running totals.

    Each count is added only when it is greater than zero; the two are
    handled independently. The totals are updated while holding
    ``self._lock``. When neither count is positive the call is a no-op.

    Args:
        char_count: Number of valid characters to add.
        token_count: Number of valid-text tokens to add.
    """
    has_chars = char_count > 0
    has_tokens = token_count > 0
    if not (has_chars or has_tokens):
        return
    with self._lock:
        if has_chars:
            self.valid_char_count_total += char_count
        if has_tokens:
            self.total_valid_text_token_count += token_count
134+
115135

116136
class TranslationConfig:
117137
@staticmethod
@@ -463,6 +483,8 @@ class TranslateResult:
463483
no_watermark_dual_pdf_path: Path | None
464484
peak_memory_usage: int | None
465485
auto_extracted_glossary_path: Path | None
486+
total_valid_character_count: int | None
487+
total_valid_text_token_count: int | None
466488

467489
def __init__(
468490
self,
@@ -479,6 +501,8 @@ def __init__(
479501
self.no_watermark_dual_pdf_path = dual_pdf_path
480502

481503
self.auto_extracted_glossary_path = auto_extracted_glossary_path
504+
self.total_valid_character_count = None
505+
self.total_valid_text_token_count = None
482506

483507
def __str__(self):
484508
"""Return a human-readable string representation of the translation result."""
@@ -524,6 +548,20 @@ def __str__(self):
524548
if hasattr(self, "peak_memory_usage") and self.peak_memory_usage:
525549
result.append(f"\tPeak memory usage: {self.peak_memory_usage} MB")
526550

551+
if hasattr(self, "total_valid_character_count") and isinstance(
552+
self.total_valid_character_count, int
553+
):
554+
result.append(
555+
f"\tTotal valid character count: {self.total_valid_character_count}"
556+
)
557+
558+
if hasattr(self, "total_valid_text_token_count") and isinstance(
559+
self.total_valid_text_token_count, int
560+
):
561+
result.append(
562+
f"\tTotal valid text token count (gpt-4o): {self.total_valid_text_token_count}"
563+
)
564+
527565
if result:
528566
result.insert(0, "Translation results:")
529567

babeldoc/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from babeldoc.translator.translator import set_translate_rate_limiter
2727

2828
logger = logging.getLogger(__name__)
29-
__version__ = "0.5.16"
29+
__version__ = "0.5.17"
3030

3131

3232
def create_parser():

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "BabelDOC"
3-
version = "0.5.16"
3+
version = "0.5.17"
44
description = "Yet Another Document Translator"
55
license = "AGPL-3.0"
66
readme = "README.md"
@@ -162,7 +162,7 @@ pythonpath = [".", "src"]
162162
testpaths = ["tests"]
163163

164164
[bumpver]
165-
current_version = "0.5.16"
165+
current_version = "0.5.17"
166166
version_pattern = "MAJOR.MINOR.PATCH[.PYTAGNUM]"
167167

168168
[bumpver.file_patterns]

0 commit comments

Comments
 (0)