|
3 | 3 | import logging |
4 | 4 | import math |
5 | 5 | import re |
| 6 | +import unicodedata |
6 | 7 | from io import BytesIO |
7 | 8 | from itertools import islice |
8 | 9 | from typing import Literal |
9 | 10 |
|
10 | 11 | import freetype |
11 | 12 | import pymupdf |
| 13 | +import tiktoken |
12 | 14 |
|
13 | 15 | import babeldoc.pdfminer.pdfinterp |
14 | 16 | from babeldoc.format.pdf.babelpdf.base14 import get_base14_bbox |
|
18 | 20 | from babeldoc.format.pdf.babelpdf.utils import guarded_bbox |
19 | 21 | from babeldoc.format.pdf.document_il import il_version_1 |
20 | 22 | from babeldoc.format.pdf.document_il.utils import zstd_helper |
| 23 | +from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper |
21 | 24 | from babeldoc.format.pdf.document_il.utils.matrix_helper import decompose_ctm |
22 | 25 | from babeldoc.format.pdf.document_il.utils.style_helper import BLACK |
23 | 26 | from babeldoc.format.pdf.document_il.utils.style_helper import YELLOW |
@@ -358,6 +361,10 @@ def __init__(self, translation_config: TranslationConfig): |
358 | 361 | self.render_order = 0 |
359 | 362 | self.current_clip_paths: list[tuple] = [] |
360 | 363 | self.clip_paths_stack: list[list[tuple]] = [] |
| 364 | + # For valid character collection |
| 365 | + self.font_mapper = FontMapper(translation_config) |
| 366 | + self.tokenizer = tiktoken.encoding_for_model("gpt-4o") |
| 367 | + self._page_valid_chars_buffer: list[str] | None = None |
361 | 368 |
|
362 | 369 | def transform_clip_path( |
363 | 370 | self, |
@@ -566,8 +573,32 @@ def on_page_start(self): |
566 | 573 | self.current_clip_paths = [] |
567 | 574 | self.clip_paths_stack = [] |
568 | 575 | self.docs.page.append(self.current_page) |
| 576 | + # Prepare per-page buffer for valid characters on translated pages |
| 577 | + self._page_valid_chars_buffer = [] |
569 | 578 |
|
def on_page_end(self):
    """Finalize per-page character/token statistics and advance progress.

    Joins the characters collected for this page (by ``_collect_valid_char``),
    counts characters and tiktoken tokens, and pushes both counts into the
    shared cross-split context. Statistics gathering is deliberately
    best-effort: any failure is logged and never interrupts page processing.
    """
    try:
        # Truthiness covers both "never initialized" (None) and an empty page.
        if self._page_valid_chars_buffer:
            page_text = "".join(self._page_valid_chars_buffer)
            char_count = len(page_text)
            try:
                # disallowed_special=() makes text that resembles special
                # tokens (e.g. "<|endoftext|>") encode as ordinary text
                # instead of raising.
                token_count = len(
                    self.tokenizer.encode(page_text, disallowed_special=())
                )
            except Exception as e:
                logger.warning("Failed to compute token count for page: %s", e)
                token_count = 0
            self.translation_config.shared_context_cross_split_part.add_valid_counts(
                char_count, token_count
            )
    except Exception as e:
        # Statistics are auxiliary; never let them break the render pipeline.
        logger.warning("Failed to accumulate page valid stats: %s", e)
    finally:
        # Reset so characters from this page never leak into the next one.
        self._page_valid_chars_buffer = []
    self.progress.advance(1)
572 | 603 |
|
573 | 604 | def on_page_crop_box( |
@@ -848,6 +879,11 @@ def on_lt_char(self, char: LTChar): |
848 | 879 | "Failed to get rotation angle for char %s", |
849 | 880 | char.get_text(), |
850 | 881 | ) |
| 882 | + # Collect valid characters for statistics |
| 883 | + try: |
| 884 | + self._collect_valid_char(char.get_text()) |
| 885 | + except Exception as e: |
| 886 | + logger.warning("Error collecting valid char: %s", e) |
851 | 887 | gs = self.create_graphic_state(char.graphicstate) |
852 | 888 | # Get font from current page or xobject |
853 | 889 | font = None |
@@ -983,6 +1019,43 @@ def on_lt_char(self, char: LTChar): |
983 | 1019 | ) |
984 | 1020 | ) |
985 | 1021 |
|
def _collect_valid_char(self, ch: str):
    """Append *ch* to the current page buffer if it counts as valid text.

    Rules:
    - Whitespace matched by ``space_regex`` is always kept.
    - Characters in Unicode categories {Cc, Cs, Co, Cn} are never text.
    - Mirrors the inverted criteria of ``formular_helper``: empty strings
      and ``(cid:`` placeholders are invalid; text unknown to the font
      mapper is invalid unless it is multi-char and every character is
      individually known.
    """
    buffer = self._page_valid_chars_buffer
    if buffer is None:
        # No page is being collected; nothing to record.
        return
    if space_regex.match(ch):
        buffer.append(ch)
        return
    try:
        category = unicodedata.category(ch[0]) if ch else None
    except Exception:
        category = None
    if category in {"Cc", "Cs", "Co", "Cn"}:
        return
    if not ch or "(cid:" in ch:
        return
    try:
        # Known as a unit, or (multi-char) known character-by-character.
        valid = self.font_mapper.has_char(ch) or (
            len(ch) > 1 and all(self.font_mapper.has_char(x) for x in ch)
        )
    except Exception:
        # A failing font lookup means we cannot vouch for the text.
        valid = False
    if valid:
        buffer.append(ch)
| 1058 | + |
986 | 1059 | def on_lt_curve(self, curve: babeldoc.pdfminer.layout.LTCurve): |
987 | 1060 | if not self.enable_graphic_element_process: |
988 | 1061 | return |
|
0 commit comments