Skip to content

Commit eab9d06

Browse files
changdazhou authored and TingquanGao committed
support glossary for translation
1 parent 203cc1f commit eab9d06

File tree

5 files changed

+124
-22
lines changed

5 files changed

+124
-22
lines changed

docs/pipeline_usage/tutorials/ocr_pipelines/PP-DocTranslation.en.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1386,13 +1386,25 @@ After executing the above code, you will obtain the parsed results of the origin
13861386
<td><code>str|None</code></td>
13871387
<td>
13881388
<ul>
1389-
<li><b>str</b>: Example data in key-value pair format, which can include a terminology对照表 (glossary)</li>
1389+
<li><b>str</b>: Example data in key-value pair format</li>
13901390
<li><b>None</b>: Do not provide structured examples</li>
13911391
</ul>
13921392
</td>
13931393
<td><code>None</code></td>
13941394
</tr>
13951395
<tr>
1396+
<td><code>glossary</code></td>
1397+
<td>Glossary of technical terms</td>
1398+
<td><code>dict|None</code></td>
1399+
<td>
1400+
<ul>
1401+
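<li><b>dict</b>: Dictionary mapping each source term to its required translation; a value may be a single string or a list of acceptable translations</li>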
1402+
<li><b>None</b>: Use default configuration</li>
1403+
</ul>
1404+
</td>
1405+
<td><code>None</code></td>
1406+
</tr>
1407+
<tr>
13961408
<td><code>llm_request_interval</code></td>
13971409
<td>Time interval in seconds for sending requests to the large language model. This parameter can be used to prevent overly frequent calls to the large language model.</td>
13981410
<td><code>float</code></td>

docs/pipeline_usage/tutorials/ocr_pipelines/PP-DocTranslation.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1448,6 +1448,18 @@ for tgt_md_info in tgt_md_info_list:
14481448
<td><code>None</code></td>
14491449
</tr>
14501450
<tr>
1451+
<td><code>glossary</code></td>
1452+
<td>专业术语对照表</td>
1453+
<td><code>dict|None</code></td>
1454+
<td>
1455+
<ul>
1456+
<li><b>dict</b>:词表映射字典</li>
1457+
<li><b>None</b>:使用默认配置</li>
1458+
</ul>
1459+
</td>
1460+
<td><code>None</code></td>
1461+
</tr>
1462+
<tr>
14511463
<td><code>llm_request_interval</code></td>
14521464
<td>向大语言模型发送请求的时间间隔,单位为秒。该参数可用于防止过于频繁地调用大语言模型。</td>
14531465
<td><code>float</code></td>

paddlex/inference/pipelines/components/prompt_engineering/generate_translate_prompt.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def generate_prompt(
163163
few_shot_demo_key_value_list = self.few_shot_demo_key_value_list
164164

165165
if few_shot_demo_key_value_list:
166-
few_shot_demo_key_value_list = f"这里是一些专业术语对照表,对照表中单词要参考对照表翻译\n{few_shot_demo_key_value_list}\n"
166+
few_shot_demo_key_value_list = f"\n这里是一些专业术语对照表,如果遇到对照表中单词要参考对照表翻译\n{few_shot_demo_key_value_list}\n"
167167

168168
prompt = f"""{task_description}{rules_str}{output_format}{few_shot_demo_text_content}{few_shot_demo_key_value_list}"""
169169

paddlex/inference/pipelines/pp_doctranslation/pipeline.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -326,8 +326,8 @@ def chunk_translate(self, md_blocks, chunk_size, translate_func):
326326
chunk = "" # Clear the chunk
327327

328328
if block_type == "text":
329-
split_text_recursive(
330-
block_content, chunk_size, translate_func, translation_results
329+
translation_results.append(
330+
split_text_recursive(block_content, chunk_size, translate_func)
331331
)
332332
elif block_type == "text_with_html" or block_type == "html":
333333
translate_html_block(
@@ -350,6 +350,7 @@ def translate(
350350
rules_str: str = None,
351351
few_shot_demo_text_content: str = None,
352352
few_shot_demo_key_value_list: str = None,
353+
glossary: Dict = None,
353354
llm_request_interval: float = 0.0,
354355
chat_bot_config: Dict = None,
355356
**kwargs,
@@ -366,6 +367,7 @@ def translate(
366367
rules_str (str, optional): Rules or guidelines for the translation model to follow. Defaults to None.
367368
few_shot_demo_text_content (str, optional): Demo text content for the translation model. Defaults to None.
368369
few_shot_demo_key_value_list (str, optional): Demo text key-value list for the translation model. Defaults to None.
370+
glossary (Dict, optional): A mapping from source terms to their required translations; a value may be a string or a list of alternative translations. Defaults to None.
369371
llm_request_interval (float, optional): The interval in seconds between each request to the LLM. Defaults to 0.0.
370372
chat_bot_config (Dict, optional): Configuration for the chat bot used in the translation process. Defaults to None.
371373
**kwargs: Additional keyword arguments passed to the translation model.
@@ -396,6 +398,22 @@ def translate(
396398
if not isinstance(llm_request_interval, float):
397399
llm_request_interval = float(llm_request_interval)
398400

401+
assert isinstance(glossary, dict) or glossary is None, "glossary must be a dict"
402+
403+
glossary_str = ""
404+
if glossary is not None:
405+
for k, v in glossary.items():
406+
if isinstance(v, list):
407+
v = "或".join(v)
408+
glossary_str += f"{k}: {v}\n"
409+
410+
if glossary_str != "":
411+
if few_shot_demo_key_value_list is None:
412+
few_shot_demo_key_value_list = glossary_str
413+
else:
414+
few_shot_demo_key_value_list += "\n"
415+
few_shot_demo_key_value_list += glossary_str
416+
399417
def translate_func(text):
400418
"""
401419
Translate the given text using the configured translation model.

paddlex/inference/pipelines/pp_doctranslation/utils.py

Lines changed: 78 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,29 @@
1515
import re
1616

1717

18+
def _is_sentence_dot(text, i):
19+
"""
20+
Check if the given character is a sentence ending punctuation.
21+
"""
22+
# if the character is not a period, return False
23+
if text[i] != ".":
24+
return False
25+
# previous character
26+
prev = text[i - 1] if i > 0 else ""
27+
# next character
28+
next = text[i + 1] if i + 1 < len(text) else ""
29+
# previous is digit or letter, then not sentence ending punctuation
30+
if prev.isdigit() or prev.isalpha():
31+
return False
32+
# next is digit or letter, then not sentence ending punctuation
33+
if next.isdigit() or next.isalpha():
34+
return False
35+
# next is a punctuation, then sentence ending punctuation
36+
if next in ("", " ", "\t", "\n", '"', "'", "”", "’", ")", "】", "」", "》"):
37+
return True
38+
return False
39+
40+
1841
def _find_split_pos(text, chunk_size):
1942
"""
2043
Find the position to split the text into two chunks.
@@ -27,21 +50,44 @@ def _find_split_pos(text, chunk_size):
2750
int: The index where the text should be split.
2851
"""
2952
center = len(text) // 2
53+
split_chars = ["\n", "。", ";", ";", "!", "!", "?", "?"]
54+
3055
# Search forward
3156
for i in range(center, len(text)):
32-
if text[i] in ["\n", ".", "。", ";", ";", "!", "!", "?", "?"]:
33-
if i + 1 < len(text) and len(text[: i + 1]) <= chunk_size:
34-
return i + 1
57+
if text[i] in split_chars:
58+
# Check for whitespace around the split character
59+
j = i + 1
60+
while j < len(text) and text[j] in " \t\n":
61+
j += 1
62+
if j < len(text) and len(text[:j]) <= chunk_size:
63+
return i, j
64+
elif text[i] == "." and _is_sentence_dot(text, i):
65+
j = i + 1
66+
while j < len(text) and text[j] in " \t\n":
67+
j += 1
68+
if j < len(text) and len(text[:j]) <= chunk_size:
69+
return i, j
70+
3571
# Search backward
3672
for i in range(center, 0, -1):
37-
if text[i] in ["\n", ".", "。", ";", ";", "!", "!", "?", "?"]:
38-
if len(text[: i + 1]) <= chunk_size:
39-
return i + 1
73+
if text[i] in split_chars:
74+
j = i + 1
75+
while j < len(text) and text[j] in " \t\n":
76+
j += 1
77+
if len(text[:j]) <= chunk_size:
78+
return i, j
79+
elif text[i] == "." and _is_sentence_dot(text, i):
80+
j = i + 1
81+
while j < len(text) and text[j] in " \t\n":
82+
j += 1
83+
if len(text[:j]) <= chunk_size:
84+
return i, j
85+
4086
# If no suitable position is found, split directly
41-
return min(chunk_size, len(text))
87+
return min(chunk_size, len(text)), min(chunk_size, len(text))
4288

4389

44-
def split_text_recursive(text, chunk_size, translate_func, results):
90+
def split_text_recursive(text, chunk_size, translate_func):
    """
    Split the text recursively and translate each chunk.

    The text is stripped and, if it fits in ``chunk_size``, handed to
    ``translate_func`` as is. Otherwise it is cut at the position chosen by
    ``_find_split_pos``, both sides are processed recursively, and the
    results are joined with the original separator (the boundary character
    and the whitespace after it), which is copied through untranslated.

    Args:
        text (str): The text to translate.
        chunk_size (int): Maximum length of a piece passed to
            ``translate_func``.
        translate_func (Callable[[str], str]): Translates one piece of text.

    Returns:
        str: The translated text.

    Raises:
        ValueError: If ``text`` is longer than ``chunk_size`` and
            ``chunk_size`` is not positive, since no progress could be made.
    """
    text = text.strip()
    if len(text) <= chunk_size:
        return translate_func(text)

    # A non-positive chunk size can never shrink the text; fail explicitly
    # instead of recursing without end.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    split_pos, resume_pos = _find_split_pos(text, chunk_size)
    left = text[:split_pos]
    separator = text[split_pos:resume_pos]
    right = text[resume_pos:]

    # Both sides are always bound, so an empty side contributes "" rather
    # than leaving a name undefined at the return below.
    left_text = split_text_recursive(left, chunk_size, translate_func) if left else ""
    right_text = (
        split_text_recursive(right, chunk_size, translate_func) if right else ""
    )

    return left_text + separator + right_text
68118

69119

70120
def translate_code_block(code_block, chunk_size, translate_func, results):
@@ -94,15 +144,14 @@ def translate_code_block(code_block, chunk_size, translate_func, results):
94144
footer = ""
95145
code_content = code_block
96146

97-
translated_code_lines = []
98-
split_text_recursive(
99-
code_content, chunk_size, translate_func, translated_code_lines
147+
translated_code_lines = split_text_recursive(
148+
code_content, chunk_size, translate_func
100149
)
101150

102151
# drop ``` or ~~~
103152
filtered_code_lines = [
104153
line
105-
for line in translated_code_lines
154+
for line in translated_code_lines.split("\n")
106155
if not (line.strip().startswith("```") or line.strip().startswith("~~~"))
107156
]
108157
translated_code = "\n".join(filtered_code_lines)
@@ -126,6 +175,17 @@ def translate_html_block(html_block, chunk_size, translate_func, results):
126175
"""
127176
from bs4 import BeautifulSoup
128177

178+
# if this is a short and simple tag, just translate it
179+
if (
180+
html_block.count("<") < 5
181+
and html_block.count(">") < 5
182+
and html_block.count("<") == html_block.count(">")
183+
and len(html_block) < chunk_size
184+
):
185+
translated = translate_func(html_block)
186+
results.append(translated)
187+
return
188+
129189
soup = BeautifulSoup(html_block, "html.parser")
130190

131191
# collect text nodes

0 commit comments

Comments
 (0)