|
3 | 3 | import logging |
4 | 4 | import math |
5 | 5 | import re |
| 6 | +import unicodedata |
6 | 7 | from io import BytesIO |
7 | 8 | from itertools import islice |
8 | 9 | from typing import Literal |
9 | 10 |
|
10 | 11 | import freetype |
11 | 12 | import pymupdf |
| 13 | +import tiktoken |
12 | 14 |
|
13 | 15 | import babeldoc.pdfminer.pdfinterp |
14 | 16 | from babeldoc.format.pdf.babelpdf.base14 import get_base14_bbox |
|
18 | 20 | from babeldoc.format.pdf.babelpdf.utils import guarded_bbox |
19 | 21 | from babeldoc.format.pdf.document_il import il_version_1 |
20 | 22 | from babeldoc.format.pdf.document_il.utils import zstd_helper |
| 23 | +from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper |
21 | 24 | from babeldoc.format.pdf.document_il.utils.matrix_helper import decompose_ctm |
22 | 25 | from babeldoc.format.pdf.document_il.utils.style_helper import BLACK |
23 | 26 | from babeldoc.format.pdf.document_il.utils.style_helper import YELLOW |
@@ -358,6 +361,10 @@ def __init__(self, translation_config: TranslationConfig): |
358 | 361 | self.render_order = 0 |
359 | 362 | self.current_clip_paths: list[tuple] = [] |
360 | 363 | self.clip_paths_stack: list[list[tuple]] = [] |
| 364 | + # For valid character collection |
| 365 | + self.font_mapper = FontMapper(translation_config) |
| 366 | + self.tokenizer = tiktoken.encoding_for_model("gpt-4o") |
| 367 | + self._page_valid_chars_buffer: list[str] | None = None |
361 | 368 |
|
362 | 369 | def transform_clip_path( |
363 | 370 | self, |
@@ -566,8 +573,32 @@ def on_page_start(self): |
566 | 573 | self.current_clip_paths = [] |
567 | 574 | self.clip_paths_stack = [] |
568 | 575 | self.docs.page.append(self.current_page) |
| 576 | + # Prepare per-page buffer for valid characters on translated pages |
| 577 | + self._page_valid_chars_buffer = [] |
569 | 578 |
|
def on_page_end(self):
    """Finalize per-page character/token statistics and advance progress.

    Joins the characters collected for this page (by ``_collect_valid_char``),
    counts characters and tiktoken tokens, and pushes both counts into the
    shared cross-split context. Statistics gathering is deliberately
    best-effort: any failure is logged and never interrupts page processing.
    """
    try:
        # Truthiness covers both "never initialized" (None) and an empty page.
        if self._page_valid_chars_buffer:
            page_text = "".join(self._page_valid_chars_buffer)
            char_count = len(page_text)
            try:
                # disallowed_special=() makes text that resembles special
                # tokens (e.g. "<|endoftext|>") encode as ordinary text
                # instead of raising.
                token_count = len(
                    self.tokenizer.encode(page_text, disallowed_special=())
                )
            except Exception as e:
                logger.warning("Failed to compute token count for page: %s", e)
                token_count = 0
            self.translation_config.shared_context_cross_split_part.add_valid_counts(
                char_count, token_count
            )
    except Exception as e:
        # Statistics are auxiliary; never let them break the render pipeline.
        logger.warning("Failed to accumulate page valid stats: %s", e)
    finally:
        # Reset so characters from this page never leak into the next one.
        self._page_valid_chars_buffer = []
    self.progress.advance(1)
572 | 603 |
|
573 | 604 | def on_page_crop_box( |
@@ -848,6 +879,11 @@ def on_lt_char(self, char: LTChar): |
848 | 879 | "Failed to get rotation angle for char %s", |
849 | 880 | char.get_text(), |
850 | 881 | ) |
| 882 | + # Collect valid characters for statistics |
| 883 | + try: |
| 884 | + self._collect_valid_char(char.get_text()) |
| 885 | + except Exception as e: |
| 886 | + logger.warning("Error collecting valid char: %s", e) |
851 | 887 | gs = self.create_graphic_state(char.graphicstate) |
852 | 888 | # Get font from current page or xobject |
853 | 889 | font = None |
@@ -983,6 +1019,43 @@ def on_lt_char(self, char: LTChar): |
983 | 1019 | ) |
984 | 1020 | ) |
985 | 1021 |
|
def _collect_valid_char(self, ch: str):
    """Append *ch* to the current page buffer if it counts as valid text.

    Rules:
    - Whitespace matched by ``space_regex`` is always kept.
    - Characters in Unicode categories {Cc, Cs, Co, Cn} are never text.
    - Mirrors the inverted criteria of ``formular_helper``: empty strings
      and ``(cid:`` placeholders are invalid; text unknown to the font
      mapper is invalid unless it is multi-char and every character is
      individually known.
    """
    buffer = self._page_valid_chars_buffer
    if buffer is None:
        # No page is being collected; nothing to record.
        return
    if space_regex.match(ch):
        buffer.append(ch)
        return
    try:
        category = unicodedata.category(ch[0]) if ch else None
    except Exception:
        category = None
    if category in {"Cc", "Cs", "Co", "Cn"}:
        return
    if not ch or "(cid:" in ch:
        return
    try:
        # Known as a unit, or (multi-char) known character-by-character.
        valid = self.font_mapper.has_char(ch) or (
            len(ch) > 1 and all(self.font_mapper.has_char(x) for x in ch)
        )
    except Exception:
        # A failing font lookup means we cannot vouch for the text.
        valid = False
    if valid:
        buffer.append(ch)
| 1058 | + |
986 | 1059 | def on_lt_curve(self, curve: babeldoc.pdfminer.layout.LTCurve): |
987 | 1060 | if not self.enable_graphic_element_process: |
988 | 1061 | return |
|
0 commit comments