Skip to content

Commit b4a27aa

Browse files
committed
prompt fix
1 parent 029614d commit b4a27aa

File tree

2 files changed

+39
-122
lines changed

2 files changed

+39
-122
lines changed

scripts/translate_eastern.py

Lines changed: 21 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -27,40 +27,24 @@
2727
with open(README_PATH, "r", encoding="utf-8") as f:
2828
original_text = f.read()
2929

30-
# --- PRE-PROCESSING ---
31-
protected_blocks = []
32-
33-
def protect_match(match):
34-
placeholder = f"__PB_{len(protected_blocks)}__"
35-
protected_blocks.append(match.group(0))
36-
return placeholder
37-
3830
text_to_translate = original_text
3931

40-
# 1. Protect Navigation Bar (Whole Block)
41-
text_to_translate = re.sub(r'(<div\s+align=["\']center["\']\s*>.*?</div>)', protect_match, text_to_translate, flags=re.DOTALL | re.IGNORECASE)
42-
# 2. Protect Logo (Whole Block)
43-
text_to_translate = re.sub(r'(<div\s+style=["\'][^"]*text-align:center[^"]*["\']\s*>.*?</div>)', protect_match, text_to_translate, flags=re.DOTALL | re.IGNORECASE)
44-
# 3. Protect Markdown Images (Badges, Screenshots)
45-
text_to_translate = re.sub(r'(!\[[^\]\r\n]*\]\([^)\r\n]+\))', protect_match, text_to_translate)
46-
# 4. Protect HTML Tags (Preserves Gallery structure <details>, <summary>, <img> but exposes text)
47-
text_to_translate = re.sub(r'(<[^>]+>)', protect_match, text_to_translate)
48-
4932
# Specialized Prompt for CJK/Eastern Languages
5033
prompt = f"""<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
51-
You are a professional technical {target_lang_name} translator. Translate the provided GitHub README into {target_lang_name}.
34+
You are a professional technical {target_lang_name} translator.
35+
Translate the provided GitHub README into {target_lang_name}.
5236
5337
CRITICAL RULES:
54-
1. **Placeholders**: You will see tags like __PB_0__, __PB_1__.
55-
- DO NOT translate them.
56-
- DO NOT remove them.
57-
- DO NOT convert underscores (_) to full-width characters. Keep them as it is.
58-
2. **Formatting**: Preserve all Markdown structure exactly.
59-
3. **Terminology**: Keep English technical terms (GPU, CLI, VRAM, Docker, CUDA) in English.
60-
4. **Context**:
38+
1. **Navigation Bar**: The top HTML block (`<div align="center">...</div>`) containing language links MUST remain EXACTLY the same. Do NOT translate the filenames or language names inside it. Do not change anything in it.
39+
2. **Badges**: Do NOT translate the text inside badge links (e.g., `![License]`, `![Python]`). Keep the URLs exactly as is.
40+
3. **Logo**: Keep the logo HTML block (`<div style="text-align:center...`) exactly as is.
41+
4. **Formatting**: Preserve all emojis and HTML/Markdown tags exactly.
42+
5. **Terminology**: Keep English technical terms (GPU, CLI, VRAM, Docker, CUDA) in English.
43+
6. **Context**:
6144
- 'Enforcement' = Policy restriction (e.g., JA: 制限/強制).
62-
- 'Headless' = Server without display.
63-
5. **Output**: ONLY the translated text. No explanations.
45+
- 'Headless' = Server without GUI/display.
46+
- 'Agnostic' = Hardware Independence (JA: 非依存, ZH: 无关性).
47+
7. **Output**: ONLY the translated text. No explanations.
6448
<|END_OF_TURN_TOKEN|>
6549
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
6650
{text_to_translate}<|END_OF_TURN_TOKEN|>
@@ -69,46 +53,20 @@ def protect_match(match):
6953
response = llm(prompt, max_tokens=6144, temperature=0, stop=["<|END_OF_TURN_TOKEN|>"])
7054
translated_content = response['choices'][0]['text'].strip()
7155

72-
# --- POST-PROCESSING: Chain Restoration ---
73-
74-
for i, block in enumerate(protected_blocks):
75-
placeholder = f"__PB_{i}__"
76-
77-
# 1. Direct replacement
78-
if placeholder in translated_content:
79-
translated_content = translated_content.replace(placeholder, block)
80-
continue
81-
82-
# 2. Loose Regex Fallback (Handles CJK full-width issues like _PB_0_)
83-
# Matches __PB_0__, _PB_0_, [PB_0], etc.
84-
loose_pattern = re.compile(rf"[\[[]?\s*[___]+\s*PB_{i}\s*[___]+\s*[\]]]?", re.IGNORECASE)
85-
if loose_pattern.search(translated_content):
86-
translated_content = loose_pattern.sub(lambda m: block, translated_content)
87-
continue
88-
89-
# 3. CRITICAL FALLBACK: Chain Insertion
90-
if i == 0:
91-
translated_content = block + "\n\n" + translated_content
92-
else:
93-
prev_block = protected_blocks[i-1]
94-
if prev_block in translated_content:
95-
# Insert current block immediately after the previous one
96-
translated_content = translated_content.replace(prev_block, prev_block + "\n" + block, 1)
97-
else:
98-
translated_content = block + "\n\n" + translated_content
56+
# --- POST-PROCESSING ---
57+
# 1. CLEANUP: Remove markdown code fences if the LLM included them
58+
if translated_content.startswith("```"):
59+
lines = translated_content.splitlines()
60+
if lines[0].startswith("```"):
61+
lines = lines[1:]
62+
if lines and lines[-1].strip().startswith("```"):
63+
lines = lines[:-1]
64+
translated_content = "\n".join(lines).strip()
9965

100-
# 4. Path Correction
66+
# 2. Path Correction
10167
# Prepend ../ to relative paths
10268
translated_content = re.sub(r'(\[.*?\]\()(?!(?:http|/|#|\.\./))', r'\1../', translated_content)
10369
translated_content = re.sub(r'((?:src|href)=["\'])(?!(?:http|/|#|\.\./))', r'\1../', translated_content)
10470

105-
# 5. Cleanup
106-
translated_content = re.sub(r'^<!--\s*|(?:\s*)?-->$', '', translated_content).strip()
107-
if translated_content.startswith("```"):
108-
lines = translated_content.splitlines()
109-
if lines[0].startswith("```"): lines = lines[1:]
110-
if lines and lines[-1].strip().startswith("```"): lines = lines[:-1]
111-
translated_content = "\n".join(lines).strip()
112-
11371
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
11472
f.write(translated_content)

scripts/translate_western.py

Lines changed: 18 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -27,33 +27,20 @@
2727
with open(README_PATH, "r", encoding="utf-8") as f:
2828
original_text = f.read()
2929

30-
# --- PRE-PROCESSING: Protect Sensitive Blocks ---
31-
protected_blocks = []
32-
33-
def protect_match(match):
34-
# Use underscores to look like code variables, which LLMs respect more
35-
placeholder = f"__PB_{len(protected_blocks)}__"
36-
protected_blocks.append(match.group(0))
37-
return placeholder
30+
# --- PRE-PROCESSING ---
3831

3932
text_to_translate = original_text
4033

41-
# 1. Protect Navigation Bar (Whole Block)
42-
text_to_translate = re.sub(r'(<div\s+align=["\']center["\']\s*>.*?</div>)', protect_match, text_to_translate, flags=re.DOTALL | re.IGNORECASE)
43-
# 2. Protect Logo (Whole Block)
44-
text_to_translate = re.sub(r'(<div\s+style=["\'][^"]*text-align:center[^"]*["\']\s*>.*?</div>)', protect_match, text_to_translate, flags=re.DOTALL | re.IGNORECASE)
45-
# 3. Protect Markdown Images (Badges, Screenshots)
46-
text_to_translate = re.sub(r'(!\[[^\]\r\n]*\]\([^)\r\n]+\))', protect_match, text_to_translate)
47-
# 4. Protect HTML Tags (Preserves Gallery structure <details>, <summary>, <img> but exposes text)
48-
text_to_translate = re.sub(r'(<[^>]+>)', protect_match, text_to_translate)
49-
5034
prompt = f"""<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
5135
You are a professional technical {target_lang_name} translator. Translate the provided README into professional developer-level {target_lang_name}.
36+
5237
CRITICAL RULES:
53-
1. **Structure**: Keep the layout exactly the same.
54-
2. **Placeholders**: You will see placeholders like __PB_0__, __PB_1__. These are images or layout blocks. KEEP THEM EXACTLY AS IS. Do not move or translate them.
55-
3. **Terminology**: Preserve terms like GPU, CLI, VRAM, SSH, Docker, API, CUDA.
56-
4. **No Talk**: Output ONLY the translated text.
38+
1. **Navigation Bar**: The top HTML block (`<div align="center">...</div>`) containing language links MUST remain EXACTLY the same. Do NOT translate the filenames or language names inside it.
39+
2. **Badges**: Do NOT translate the text inside badge links (e.g., `![License]`, `![Python]`). Keep the URLs exactly as is.
40+
3. **Logo**: Keep the logo HTML block (`<div style="text-align:center...`) exactly as is.
41+
4. **Formatting**: Preserve all emojis and HTML/Markdown tags exactly.
42+
5. **Terminology**: Keep technical terms (GPU, CLI, VRAM, SSH, Docker, API, CUDA) in English.
43+
6. **No Talk**: Output ONLY the translated text. Do not wrap the output in markdown code fences (```).
5744
<|END_OF_TURN_TOKEN|>
5845
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
5946
{text_to_translate}<|END_OF_TURN_TOKEN|>
@@ -62,49 +49,21 @@ def protect_match(match):
6249
response = llm(prompt, max_tokens=6144, temperature=0, stop=["<|END_OF_TURN_TOKEN|>"])
6350
translated_content = response['choices'][0]['text'].strip()
6451

65-
# --- POST-PROCESSING: Chain Restoration ---
66-
67-
for i, block in enumerate(protected_blocks):
68-
placeholder = f"__PB_{i}__"
69-
70-
# 1. Direct replacement (Best case)
71-
if placeholder in translated_content:
72-
translated_content = translated_content.replace(placeholder, block)
73-
continue
74-
75-
# 2. Loose Regex Fallback (Handles spacing issues)
76-
loose_pattern = re.compile(rf"\[?\s*__\s*PB_{i}\s*__\s*\]?", re.IGNORECASE)
77-
if loose_pattern.search(translated_content):
78-
translated_content = loose_pattern.sub(lambda m: block, translated_content)
79-
continue
52+
# --- POST-PROCESSING ---
53+
# 1. CLEANUP: Remove markdown code fences if the LLM included them
54+
if translated_content.startswith("```"):
55+
lines = translated_content.splitlines()
56+
if lines[0].startswith("```"):
57+
lines = lines[1:]
58+
if lines and lines[-1].strip().startswith("```"):
59+
lines = lines[:-1]
60+
translated_content = "\n".join(lines).strip()
8061

81-
# 3. CRITICAL FALLBACK: Chain Insertion
82-
# If a block is missing, insert it immediately after the previous block.
83-
# This ensures Nav -> Logo -> Badge1 -> Badge2 order is preserved even if the LLM drops them.
84-
85-
if i == 0:
86-
# Nav missing? Prepend to file.
87-
translated_content = block + "\n\n" + translated_content
88-
else:
89-
# Insert after the previous block (which is guaranteed to be in the text now)
90-
prev_block = protected_blocks[i-1]
91-
if prev_block in translated_content:
92-
# Replace the previous block with "Previous + New"
93-
# We pass count=1 to replace() so only the first occurrence is extended; this avoids duplicating the block if the previous block appears multiple times (unlikely for these headers)
94-
translated_content = translated_content.replace(prev_block, prev_block + "\n" + block, 1)
95-
else:
96-
# If previous block is somehow missing (shouldn't happen due to loop order), just prepend
97-
translated_content = block + "\n\n" + translated_content
9862

99-
# 4. Path Correction
63+
# 2. Path Correction
10064
# Prepend ../ to relative paths
10165
translated_content = re.sub(r'(\[.*?\]\()(?!(?:http|/|#|\.\./))', r'\1../', translated_content)
10266
translated_content = re.sub(r'((?:src|href)=["\'])(?!(?:http|/|#|\.\./))', r'\1../', translated_content)
10367

104-
# 5. Cleanup
105-
translated_content = re.sub(r'^<!--\s*|(?:\s*)?-->$', '', translated_content).strip()
106-
if translated_content.startswith("```"):
107-
translated_content = "\n".join(translated_content.splitlines()[1:-1]).strip()
108-
10968
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
11069
f.write(translated_content)

0 commit comments

Comments
 (0)