Skip to content

Commit 258a92b

Browse files
authored
Merge pull request #555 from qazxcdswe123/disable-same-input-output-fallback
Add an (off-by-default) option to disable the same-input/output fallback; update logging format
2 parents 6153534 + 43ceda3 commit 258a92b

File tree

5 files changed

+30
-11
lines changed

5 files changed

+30
-11
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ uv run babeldoc --files example.pdf --files example2.pdf --openai --openai-model
195195
- `--openai`: Use OpenAI for translation (default: False)
196196
- `--custom-system-prompt`: Custom system prompt for translation.
197197
- `--add-formula-placehold-hint`: Add formula placeholder hint for translation. (Currently not recommended, it may affect translation quality, default: False)
198+
- `--disable-same-text-fallback`: Disable fallback translation when LLM output matches input text. (default: False)
198199
- `--pool-max-workers`: Maximum number of worker threads for internal task processing pools. If not specified, defaults to QPS value. This parameter directly sets the worker count, replacing previous QPS-based dynamic calculations.
199200
- `--no-auto-extract-glossary`: Disable automatic term extraction. If this flag is present, the step is skipped. Defaults to enabled.
200201

@@ -305,6 +306,7 @@ openai-model = "gpt-4o-mini"
305306
openai-base-url = "https://api.openai.com/v1"
306307
openai-api-key = "your-api-key-here"
307308
enable-json-mode-if-requested = false # Enable JSON mode when requested (default: false)
309+
disable-same-text-fallback = false # Disable fallback translation when LLM output matches input text (default: false)
308310
pool-max-workers = 8 # Maximum worker threads for task processing (defaults to QPS value if not set)
309311

310312
# Glossary Options (Optional)

babeldoc/format/pdf/document_il/midend/il_translator_llm_only.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -761,7 +761,12 @@ def translate_paragraph(
761761
input_token_count = self.calc_token_count(trimed_input)
762762
output_token_count = self.calc_token_count(output_unicode)
763763

764-
if trimed_input == output_unicode and input_token_count > 10:
764+
same_as_input = trimed_input == output_unicode
765+
if (
766+
same_as_input
767+
and input_token_count > 10
768+
and not self.translation_config.disable_same_text_fallback
769+
):
765770
llm_translate_tracker.set_error_message(
766771
"Translation result is the same as input, fallback."
767772
)
@@ -781,16 +786,19 @@ def translate_paragraph(
781786
llm_translate_tracker.set_placeholder_full_match()
782787
continue
783788

784-
edit_distance = Levenshtein.distance(input_unicode, output_unicode)
785-
if edit_distance < 5 and input_token_count > 20:
786-
llm_translate_tracker.set_error_message(
787-
f"Translation result edit distance is too small. distance: {edit_distance}, input: {input_unicode}, output: {output_unicode}"
788-
)
789-
logger.warning(
790-
f"Translation result edit distance is too small. distance: {edit_distance}, input: {input_unicode}, output: {output_unicode}"
789+
if not self.translation_config.disable_same_text_fallback:
790+
edit_distance = Levenshtein.distance(
791+
input_unicode, output_unicode
791792
)
792-
llm_translate_tracker.set_placeholder_full_match()
793-
continue
793+
if edit_distance < 5 and input_token_count > 20:
794+
llm_translate_tracker.set_error_message(
795+
f"Translation result edit distance is too small. distance: {edit_distance}, input: {input_unicode}, output: {output_unicode}"
796+
)
797+
logger.warning(
798+
f"Translation result edit distance is too small. distance: {edit_distance}, input: {input_unicode}, output: {output_unicode}"
799+
)
800+
llm_translate_tracker.set_placeholder_full_match()
801+
continue
794802
# Apply the translation to the paragraph
795803
self.il_translator.post_translate_paragraph(
796804
inputs[id_][2],

babeldoc/format/pdf/translation_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ def __init__(
199199
term_extraction_translator: BaseTranslator | None = None,
200200
metadata_extra_data: str | None = None,
201201
term_pool_max_workers: int | None = None,
202+
disable_same_text_fallback: bool = False,
202203
):
203204
self.translator = translator
204205
self.term_extraction_translator = term_extraction_translator or translator
@@ -354,6 +355,7 @@ def __init__(
354355
"completion_tokens": 0,
355356
"cache_hit_prompt_tokens": 0,
356357
}
358+
self.disable_same_text_fallback = disable_same_text_fallback
357359

358360
if self.ocr_workaround:
359361
self.remove_non_formula_lines = False

babeldoc/main.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,12 @@ def create_parser():
269269
default=False,
270270
help="Add formula placeholder hint for translation. (Currently not recommended, it may affect translation quality, default: False)",
271271
)
272+
translation_group.add_argument(
273+
"--disable-same-text-fallback",
274+
action="store_true",
275+
default=False,
276+
help="Disable fallback translation when LLM output matches input text. (default: False)",
277+
)
272278
translation_group.add_argument(
273279
"--glossary-files",
274280
type=str,
@@ -703,6 +709,7 @@ async def main():
703709
custom_system_prompt=args.custom_system_prompt,
704710
working_dir=working_dir,
705711
add_formula_placehold_hint=args.add_formula_placehold_hint,
712+
disable_same_text_fallback=args.disable_same_text_fallback,
706713
glossaries=loaded_glossaries,
707714
pool_max_workers=args.pool_max_workers,
708715
auto_extract_glossary=args.auto_extract_glossary,

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)