Skip to content

Commit 910c8ed

Browse files
committed
split lang
1 parent 75c3920 commit 910c8ed

File tree

4 files changed

+232
-3
lines changed

4 files changed

+232
-3
lines changed

.github/workflows/translate.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ jobs:
4848
wget -O models/aya-expanse-8b-q4_k_s.gguf https://huggingface.co/matrixportalx/aya-expanse-8b-Q4_K_S-GGUF/resolve/main/aya-expanse-8b-q4_k_s.gguf
4949
5050
- name: Run Translation Script
51-
run: python scripts/translate.py --lang ${{ matrix.lang }}
51+
run: python scripts/translate_western.py --lang ${{ matrix.lang }}
5252

5353
- name: Upload Translation Artifact
5454
uses: actions/upload-artifact@v4
@@ -120,7 +120,7 @@ jobs:
120120
wget -O models/aya-expanse-8b-q4_k_s.gguf https://huggingface.co/matrixportalx/aya-expanse-8b-Q4_K_S-GGUF/resolve/main/aya-expanse-8b-q4_k_s.gguf
121121
122122
- name: Run Translation Script
123-
run: python scripts/translate.py --lang ${{ matrix.lang }}
123+
run: python scripts/translate_eastern.py --lang ${{ matrix.lang }}
124124

125125
- name: Upload Translation Artifact
126126
uses: actions/upload-artifact@v4

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
<a href="locales/README.ja.md">🇯🇵 日本語</a> |
77
<a href="locales/README.zh.md">🇨🇳 中文</a> |
88
<a href="locales/README.pt.md">🇵🇹 Português</a> |
9-
<a href="locales/README.ko.md">🇰🇷 한국어</a> |
9+
<a href="locales/README.ko.md">🇰🇷 한국어</a> |
1010
<a href="README.hi.md">🇮🇳 Hindi</a>
1111
</div>
1212

scripts/translate_eastern.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import os
2+
import re
3+
import argparse
4+
from llama_cpp import Llama
5+
6+
# Language code accepted on the command line -> language name that is
# interpolated into the translation prompt.
LANG_MAP = dict(
    ja="Japanese",
    zh="Chinese(Simplified)",
    ko="Korean",
    hi="Hindi",
)

# Command line: a single mandatory --lang option.
parser = argparse.ArgumentParser()
parser.add_argument("--lang", required=True, type=str)
args = parser.parse_args()

# A code that is missing from LANG_MAP falls back to "English" instead of
# raising.
target_lang_name = LANG_MAP[args.lang] if args.lang in LANG_MAP else "English"

# All paths hang off BASE_DIR, the parent of the directory holding this script.
_script_dir = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(_script_dir)
README_PATH = os.path.join(BASE_DIR, "README.md")
OUTPUT_DIR = os.path.join(BASE_DIR, "locales")
OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"README.{args.lang}.md")
MODEL_PATH = os.path.join(BASE_DIR, "models", "aya-expanse-8b-q4_k_s.gguf")

# The output directory is created before the model is loaded.
os.makedirs(OUTPUT_DIR, exist_ok=True)
llm = Llama(model_path=MODEL_PATH, n_ctx=6144, n_threads=2, verbose=False)

# Source document that will be masked and translated further down.
with open(README_PATH, encoding="utf-8") as readme_file:
    original_text = readme_file.read()
29+
30+
# --- PRE-PROCESSING ---
# Verbatim fragments that are hidden from the model. A fragment's position in
# this list is the number embedded in its placeholder.
protected_blocks = []


def protect_match(match):
    """Store the matched text and return the placeholder that replaces it.

    Meant to be passed to ``re.sub`` as the replacement callback: the whole
    match is appended to ``protected_blocks`` and ``__PB_<n>__`` is returned,
    where ``<n>`` is the index the fragment received in that list.
    """
    index = len(protected_blocks)
    protected_blocks.append(match.group(0))
    return f"__PB_{index}__"
37+
38+
text_to_translate = original_text

# Masking passes, applied in this order; each match is swapped for a
# placeholder by protect_match:
#   1. Protect Navigation Bar  - <div ... align="center"> ... </div>
#   2. Protect Logo Block      - <div ... style="... text-align: center ..."> ... </div>
#   3. Protect ALL Images (Badges + Gallery) - ![alt](target) on a single line
_div_flags = re.DOTALL | re.IGNORECASE
for _pattern, _flags in (
    (r'(<div\s+[^>]*align=["\']center["\'][^>]*>.*?</div>)', _div_flags),
    (r'(<div\s+[^>]*style=["\'][^"\']*text-align:\s*center[^"\']*["\'][^>]*>.*?</div>)', _div_flags),
    (r'(!\[[^\]\r\n]*\]\([^)\r\n]+\))', 0),
):
    text_to_translate = re.sub(_pattern, protect_match, text_to_translate, flags=_flags)
46+
47+
# Specialized Prompt for CJK/Eastern Languages
# The system turn carries the translation rules; the masked README is sent as
# the user turn and the model continues after the chatbot marker.
prompt = f"""<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
You are a professional technical {target_lang_name} translator. Translate the provided GitHub README into {target_lang_name}.

CRITICAL RULES:
1. **Placeholders**: You will see tags like __PB_0__, __PB_1__.
- DO NOT translate them.
- DO NOT remove them.
- DO NOT convert underscores (_) to full-width characters. Keep them as it is.
2. **Formatting**: Preserve all Markdown structure exactly.
3. **Terminology**: Keep English technical terms (GPU, CLI, VRAM, Docker, CUDA) in English.
4. **Context**:
- 'Enforcement' = Policy restriction (e.g., JA: 制限/強制).
- 'Headless' = Server without display.
5. **Output**: ONLY the translated text. No explanations.
<|END_OF_TURN_TOKEN|>
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
{text_to_translate}<|END_OF_TURN_TOKEN|>
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"""

response = llm(prompt, max_tokens=6144, temperature=0, stop=["<|END_OF_TURN_TOKEN|>"])
first_choice = response['choices'][0]

# A finish reason of "length" means generation stopped because the token
# budget ran out, not because the model reached the stop marker, so the text
# is cut short. Report it so that a truncated README is not written silently.
if first_choice.get("finish_reason") == "length":
    print(f"WARNING: translation for '{args.lang}' stopped at the token limit; the output is probably truncated.")

translated_content = first_choice['text'].strip()
69+
70+
# --- POST-PROCESSING: Chain Restoration ---
# Put every protected fragment back where its placeholder ended up. Fragments
# are handled in index order so that one whose placeholder was lost can be
# anchored right behind the fragment restored before it.

for i, block in enumerate(protected_blocks):
    placeholder = f"__PB_{i}__"

    # 1. Direct replacement
    if placeholder in translated_content:
        translated_content = translated_content.replace(placeholder, block)
        continue

    # 2. Loose Regex Fallback (handles placeholders the model has mangled)
    # Accepted variations: any number of ASCII or full-width (U+FF3F)
    # underscores, whitespace inside the tag, different letter case, and an
    # optional ASCII or full-width (U+FF3B / U+FF3D) square bracket on either
    # side, e.g. _PB_0_, __ pb_0 __, [__PB_0__].
    #  * The full-width code points are written as \uXXXX escapes so the
    #    pattern keeps its meaning even if the file passes through a tool that
    #    normalises full-width characters to ASCII.
    #  * Brackets are optional on both sides; whitespace is consumed only
    #    between a bracket and the tag, so blank lines around a placeholder
    #    survive the substitution.
    #  * At least one underscore must follow the index, so PB_1 cannot match
    #    inside PB_10.
    loose_pattern = re.compile(
        rf"(?:[\[\uFF3B]\s*)?[_\uFF3F]+\s*PB\s*[_\uFF3F]\s*{i}\s*[_\uFF3F]+(?:\s*[\]\uFF3D])?",
        re.IGNORECASE,
    )
    if loose_pattern.search(translated_content):
        # A callable replacement inserts the block literally; a plain string
        # would have its backslashes interpreted as escapes by re.sub.
        translated_content = loose_pattern.sub(lambda m: block, translated_content)
        continue

    # 3. CRITICAL FALLBACK: Chain Insertion
    # The placeholder is gone entirely. The first block goes to the top of the
    # document; any later block is inserted right after the first occurrence
    # of its predecessor, or at the top when the predecessor cannot be found.
    if i == 0:
        translated_content = block + "\n\n" + translated_content
    else:
        prev_block = protected_blocks[i-1]
        if prev_block in translated_content:
            # Insert current block immediately after the previous one
            translated_content = translated_content.replace(prev_block, prev_block + "\n" + block, 1)
        else:
            translated_content = block + "\n\n" + translated_content
97+
98+
# 4. Path Correction
# The translated file is written into locales/, one directory below the
# README it was produced from, so relative link targets must be re-based:
#   * a target that starts with locales/ loses that prefix (it is a sibling
#     of the output file);
#   * any other relative target gains a leading ../.
# Targets with a URI scheme (http:, https:, mailto:, ...), root-relative
# paths, in-page anchors and targets already starting with ../ are untouched.
#
# Both rules are decided per target in a single substitution. Stripping
# locales/ first and prepending ../ in a later pass would rewrite
# locales/README.ja.md to ../README.ja.md, which no longer points at the
# sibling file.
_NOT_REBASABLE = r'(?![A-Za-z][A-Za-z0-9+.-]*:|/|#|\.\./)'
_MARKDOWN_TARGET = re.compile(r'(\[.*?\]\()(locales/)?' + _NOT_REBASABLE)
_HTML_TARGET = re.compile(r'((?:src|href)=["\'])(locales/)?' + _NOT_REBASABLE)


def _rebase_link(match):
    """Return the link opener, minus a locales/ prefix or plus a ../ prefix."""
    opener, in_locales = match.group(1), match.group(2)
    return opener if in_locales else opener + "../"


# Markdown links/images and HTML attributes are handled in separate passes so
# that an attribute sitting between a "[" and a later "](" is still visited.
translated_content = _MARKDOWN_TARGET.sub(_rebase_link, translated_content)
translated_content = _HTML_TARGET.sub(_rebase_link, translated_content)
106+
107+
# 5. Cleanup
# Remove a "<!--" at the very start and a "-->" at the very end of the text,
# then trim the surrounding whitespace.
translated_content = re.sub(r'^<!--\s*|(?:\s*)?-->$', '', translated_content).strip()

# Unwrap a result that is enclosed in a Markdown code fence: the opening fence
# line is always removed, the last line only when it is a fence itself.
if translated_content.startswith("```"):
    _fence_line, *body_lines = translated_content.splitlines()
    if body_lines and body_lines[-1].strip().startswith("```"):
        del body_lines[-1]
    translated_content = "\n".join(body_lines).strip()

with open(OUTPUT_PATH, mode="w", encoding="utf-8") as out_file:
    out_file.write(translated_content)

scripts/translate_western.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import os
2+
import re
3+
import argparse
4+
from llama_cpp import Llama
5+
6+
# Language code accepted on the command line -> language name that is
# interpolated into the translation prompt.
LANG_MAP = dict(
    de="German",
    fr="French",
    es="Spanish",
    pt="Portuguese",
)

# Command line: a single mandatory --lang option.
parser = argparse.ArgumentParser()
parser.add_argument("--lang", required=True, type=str)
args = parser.parse_args()

# A code that is missing from LANG_MAP falls back to "English" instead of
# raising.
target_lang_name = LANG_MAP[args.lang] if args.lang in LANG_MAP else "English"

# All paths hang off BASE_DIR, the parent of the directory holding this script.
_script_dir = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(_script_dir)
README_PATH = os.path.join(BASE_DIR, "README.md")
OUTPUT_DIR = os.path.join(BASE_DIR, "locales")
OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"README.{args.lang}.md")
MODEL_PATH = os.path.join(BASE_DIR, "models", "aya-expanse-8b-q4_k_s.gguf")

# The output directory is created before the model is loaded.
os.makedirs(OUTPUT_DIR, exist_ok=True)
llm = Llama(model_path=MODEL_PATH, n_ctx=6144, n_threads=2, verbose=False)

# Source document that will be masked and translated further down.
with open(README_PATH, encoding="utf-8") as readme_file:
    original_text = readme_file.read()
29+
30+
# --- PRE-PROCESSING: Protect Sensitive Blocks ---
# Verbatim fragments that are hidden from the model. A fragment's position in
# this list is the number embedded in its placeholder.
protected_blocks = []


def protect_match(match):
    """Store the matched text and return the placeholder that replaces it.

    Meant to be passed to ``re.sub`` as the replacement callback: the whole
    match is appended to ``protected_blocks`` and ``__PB_<n>__`` is returned,
    where ``<n>`` is the index the fragment received in that list.
    """
    index = len(protected_blocks)
    protected_blocks.append(match.group(0))
    # The underscore-wrapped form is chosen to resemble a code identifier, on
    # the expectation that the model is more likely to leave it untouched.
    return f"__PB_{index}__"
38+
39+
text_to_translate = original_text

# Masking passes, applied in this order; each match is swapped for a
# placeholder by protect_match:
#   1. Protect Navigation Bar  - <div ... align="center"> ... </div>
#   2. Protect Logo Block      - <div ... style="... text-align: center ..."> ... </div>
#   3. Protect ALL Images (Badges + Gallery) - ![alt](target) on a single
#      line, so that neither the gallery nor the badges reach the model
_div_flags = re.DOTALL | re.IGNORECASE
for _pattern, _flags in (
    (r'(<div\s+[^>]*align=["\']center["\'][^>]*>.*?</div>)', _div_flags),
    (r'(<div\s+[^>]*style=["\'][^"\']*text-align:\s*center[^"\']*["\'][^>]*>.*?</div>)', _div_flags),
    (r'(!\[[^\]\r\n]*\]\([^)\r\n]+\))', 0),
):
    text_to_translate = re.sub(_pattern, protect_match, text_to_translate, flags=_flags)
48+
49+
# The system turn carries the translation rules; the masked README is sent as
# the user turn and the model continues after the chatbot marker.
prompt = f"""<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
You are a professional technical translator. Translate the provided README into professional developer-level {target_lang_name}.
CRITICAL RULES:
1. **Structure**: Keep the layout exactly the same.
2. **Placeholders**: You will see placeholders like __PB_0__, __PB_1__. These are images or layout blocks. KEEP THEM EXACTLY AS IS. Do not move or translate them.
3. **Terminology**: Preserve terms like GPU, CLI, VRAM, SSH, Docker, API, CUDA.
4. **No Talk**: Output ONLY the translated text.
<|END_OF_TURN_TOKEN|>
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
{text_to_translate}<|END_OF_TURN_TOKEN|>
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"""

response = llm(prompt, max_tokens=6144, temperature=0, stop=["<|END_OF_TURN_TOKEN|>"])
first_choice = response['choices'][0]

# A finish reason of "length" means generation stopped because the token
# budget ran out, not because the model reached the stop marker, so the text
# is cut short. Report it so that a truncated README is not written silently.
if first_choice.get("finish_reason") == "length":
    print(f"WARNING: translation for '{args.lang}' stopped at the token limit; the output is probably truncated.")

translated_content = first_choice['text'].strip()
63+
64+
# --- POST-PROCESSING: Chain Restoration ---
# Put every protected fragment back where its placeholder ended up. Fragments
# are handled in index order so that one whose placeholder was lost can be
# anchored right behind the fragment restored before it.

for i, block in enumerate(protected_blocks):
    placeholder = f"__PB_{i}__"

    # 1. Direct replacement (Best case)
    if placeholder in translated_content:
        translated_content = translated_content.replace(placeholder, block)
        continue

    # 2. Loose Regex Fallback (Handles spacing issues)
    # Accepts whitespace inside the tag, different letter case and an optional
    # square bracket on either side, e.g. __ PB_0 __, __pb_0__, [__PB_0__].
    # Whitespace is consumed only between a bracket and the tag, so the blank
    # lines that separate a placeholder from the neighbouring paragraphs
    # survive the substitution.
    loose_pattern = re.compile(rf"(?:\[\s*)?__\s*PB_{i}\s*__(?:\s*\])?", re.IGNORECASE)
    if loose_pattern.search(translated_content):
        # A callable replacement inserts the block literally; a plain string
        # would have its backslashes interpreted as escapes by re.sub.
        translated_content = loose_pattern.sub(lambda m: block, translated_content)
        continue

    # 3. CRITICAL FALLBACK: Chain Insertion
    # If a block is missing, insert it immediately after the previous block.
    # This keeps the Nav -> Logo -> Badge1 -> Badge2 order even if the model
    # dropped the placeholders.

    if i == 0:
        # First block missing: put it at the top of the document.
        translated_content = block + "\n\n" + translated_content
    else:
        # Anchor on the previous block, which an earlier iteration restored
        # or inserted.
        prev_block = protected_blocks[i-1]
        if prev_block in translated_content:
            # Only the first occurrence is extended, so a previous block that
            # appears more than once does not duplicate the current one.
            translated_content = translated_content.replace(prev_block, prev_block + "\n" + block, 1)
        else:
            # Previous block not found either: fall back to prepending.
            translated_content = block + "\n\n" + translated_content
97+
98+
# 4. Path Correction
# The translated file is written into locales/, one directory below the
# README it was produced from, so relative link targets must be re-based:
#   * a target that starts with locales/ loses that prefix (it is a sibling
#     of the output file);
#   * any other relative target gains a leading ../.
# Targets with a URI scheme (http:, https:, mailto:, ...), root-relative
# paths, in-page anchors and targets already starting with ../ are untouched.
#
# Both rules are decided per target in a single substitution. Stripping
# locales/ first and prepending ../ in a later pass would rewrite
# locales/README.pt.md to ../README.pt.md, which no longer points at the
# sibling file.
_NOT_REBASABLE = r'(?![A-Za-z][A-Za-z0-9+.-]*:|/|#|\.\./)'
_MARKDOWN_TARGET = re.compile(r'(\[.*?\]\()(locales/)?' + _NOT_REBASABLE)
_HTML_TARGET = re.compile(r'((?:src|href)=["\'])(locales/)?' + _NOT_REBASABLE)


def _rebase_link(match):
    """Return the link opener, minus a locales/ prefix or plus a ../ prefix."""
    opener, in_locales = match.group(1), match.group(2)
    return opener if in_locales else opener + "../"


# Markdown links/images and HTML attributes are handled in separate passes so
# that an attribute sitting between a "[" and a later "](" is still visited.
translated_content = _MARKDOWN_TARGET.sub(_rebase_link, translated_content)
translated_content = _HTML_TARGET.sub(_rebase_link, translated_content)
106+
107+
# 5. Cleanup
# Remove a "<!--" at the very start and a "-->" at the very end of the text,
# then trim the surrounding whitespace.
translated_content = re.sub(r'^<!--\s*|(?:\s*)?-->$', '', translated_content).strip()

# Unwrap a result that is enclosed in a Markdown code fence.
if translated_content.startswith("```"):
    # The first line is the opening fence and is always removed.
    lines = translated_content.splitlines()[1:]
    # The last line is removed only when it really is the closing fence.
    # Dropping it unconditionally would discard the final line of the
    # translation whenever the closing fence is missing.
    if lines and lines[-1].strip().startswith("```"):
        lines = lines[:-1]
    translated_content = "\n".join(lines).strip()

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.write(translated_content)

0 commit comments

Comments
 (0)