Skip to content

Commit ecce7a9

Browse files
committed
prompt fix
1 parent 98026e0 commit ecce7a9

File tree

3 files changed

+130
-30
lines changed

3 files changed

+130
-30
lines changed

.github/workflows/translate.yml

Lines changed: 91 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,14 @@ permissions:
1010
contents: write
1111

1212
jobs:
13-
translate:
14-
name: Translate to ${{ matrix.lang }}
13+
# JOB 1: Western Languages (Faster, ~11 mins)
14+
translate-western:
15+
name: Western (${{ matrix.lang }})
1516
runs-on: ubuntu-latest
1617
strategy:
1718
fail-fast: false # If one language fails, let the others finish
1819
matrix:
19-
lang: [de, fr, es, ja, zh, pt, ko] # German, French, Spanish, Japanese, Chinese, Portuguese, Korean
20+
lang: [de, fr, es, pt] # German, French, Spanish, Portuguese
2021
steps:
2122
- name: Checkout Code
2223
uses: actions/checkout@v4
@@ -52,26 +53,103 @@ jobs:
5253
- name: Upload Translation Artifact
5354
uses: actions/upload-artifact@v4
5455
with:
55-
name: readme-${{ matrix.lang }}
56+
name: western-readme-${{ matrix.lang }}
5657
path: locales/README.${{ matrix.lang }}.md
5758

58-
commit-translations:
59-
needs: translate
59+
commit-western:
60+
needs: translate-western
6061
runs-on: ubuntu-latest
6162
steps:
6263
- name: Checkout Code
6364
uses: actions/checkout@v4
6465

65-
- name: Download All Translations
66+
- name: Download Western Translations
6667
uses: actions/download-artifact@v4
6768
with:
68-
pattern: readme-*
69+
pattern: western-readme-*
6970
path: locales
7071
merge-multiple: true
7172

72-
- name: Commit and Push Changes
73-
uses: stefanzweifel/git-auto-commit-action@v5
73+
- name: Commit and Push (Western)
74+
run: |
75+
git config --global user.name "github-actions[bot]"
76+
git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"
77+
git add locales/*.md
78+
git commit -m "docs: update western translations (DE, FR, ES, PT)" || echo "No changes to commit"
79+
80+
# Retry logic to handle race conditions if Eastern finishes at the same time
81+
for i in {1..5}; do
82+
git pull --rebase
83+
if git push; then exit 0; fi
84+
echo "Push failed, retrying in 5s..."
85+
sleep 5
86+
done
87+
exit 1
88+
89+
# JOB 2: Eastern/Complex Languages (Slower, ~17 mins)
90+
translate-eastern:
91+
name: Eastern (${{ matrix.lang }})
92+
runs-on: ubuntu-latest
93+
strategy:
94+
fail-fast: false
95+
matrix:
96+
lang: [ja, zh, ko, hi] # Japanese, Chinese, Korean, Hindi
97+
steps:
98+
- name: Checkout Code
99+
uses: actions/checkout@v4
100+
with:
101+
fetch-depth: 0
102+
- name: Set up Python
103+
uses: actions/setup-python@v5
104+
with:
105+
python-version: '3.10'
106+
- name: Cache Model Weights
107+
id: cache-model
108+
uses: actions/cache@v4
109+
with:
110+
path: ./models
111+
key: aya-expanse-8b-q4ks-v4
112+
- name: Install Dependencies
113+
run: |
114+
pip install llama-cpp-python
115+
mkdir -p models
116+
- name: Download Aya Expanse TL Model
117+
if: steps.cache-model.outputs.cache-hit != 'true'
118+
run: |
119+
# Downloading Aya Expanse 8B (Q4_K_S) for multilingual technical translation
120+
wget -O models/aya-expanse-8b-q4_k_s.gguf https://huggingface.co/matrixportalx/aya-expanse-8b-Q4_K_S-GGUF/resolve/main/aya-expanse-8b-q4_k_s.gguf
121+
122+
- name: Run Translation Script
123+
run: python scripts/translate.py --lang ${{ matrix.lang }}
124+
125+
- name: Upload Translation Artifact
126+
uses: actions/upload-artifact@v4
74127
with:
75-
commit_message: "docs: update multilingual translations"
76-
file_pattern: 'locales/*.md'
77-
push_options: '--force-with-lease'
128+
name: eastern-readme-${{ matrix.lang }}
129+
path: locales/README.${{ matrix.lang }}.md
130+
131+
commit-eastern:
132+
needs: translate-eastern
133+
runs-on: ubuntu-latest
134+
steps:
135+
- name: Checkout Code
136+
uses: actions/checkout@v4
137+
- name: Download Eastern Translations
138+
uses: actions/download-artifact@v4
139+
with:
140+
pattern: eastern-readme-*
141+
path: locales
142+
merge-multiple: true
143+
- name: Commit and Push (Eastern)
144+
run: |
145+
git config --global user.name "github-actions[bot]"
146+
git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"
147+
git add locales/*.md
148+
git commit -m "docs: update eastern translations (JA, ZH, KO, HI)" || echo "No changes to commit"
149+
for i in {1..5}; do
150+
git pull --rebase
151+
if git push; then exit 0; fi
152+
echo "Push failed, retrying in 5s..."
153+
sleep 5
154+
done
155+
exit 1

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
<a href="locales/README.ja.md">🇯🇵 日本語</a> |
77
<a href="locales/README.zh.md">🇨🇳 中文</a> |
88
<a href="locales/README.pt.md">🇵🇹 Português</a> |
9-
<a href="locales/README.ko.md">🇰🇷 한국어</a>
9+
<a href="locales/README.ko.md">🇰🇷 한국어</a> |
10+
<a href="locales/README.hi.md">🇮🇳 हिन्दी</a>
1011
</div>
1112

1213
<div style="text-align:center; margin:18px 0;">

scripts/translate.py

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
"ru": "Russian",
1313
"pt": "Portuguese",
1414
"ko": "Korean",
15+
"hi": "Hindi",
1516
}
1617

1718
parser = argparse.ArgumentParser()
@@ -36,32 +37,38 @@
3637
# We replace complex blocks with placeholders so the LLM cannot mangle them.
3738
protected_blocks = []
3839

40+
# Swap each matched block for a numbered placeholder and remember the original text for later restoration.
3941
def protect_match(match):
40-
placeholder = f"[PROTECTED_BLOCK_{len(protected_blocks)}]"
42+
# Use something clearly non-linguistic
43+
placeholder = f"[[PB_{len(protected_blocks)}]]"
4144
protected_blocks.append(match.group(0))
4245
return placeholder
4346

4447
text_to_translate = original_text
4548

46-
# 1. Protect Navigation Bar (<div align="center">...</div>)
47-
text_to_translate = re.sub(r'(<div align="center">.*?</div>)', protect_match, text_to_translate, flags=re.DOTALL)
48-
# 2. Protect Logo Block (<div style="text-align:center...>)
49-
text_to_translate = re.sub(r'(<div style="text-align:center; margin:18px 0;">.*?</div>)', protect_match, text_to_translate, flags=re.DOTALL)
50-
# 3. Protect Badges (![...](https://img.shields.io/...)) - Prevents URL translation
51-
text_to_translate = re.sub(r'(!\[.*?\]\(https://img\.shields\.io/.*?\))', protect_match, text_to_translate)
49+
# 1. Protect Navigation Bar (Robust regex for attributes and whitespace)
50+
text_to_translate = re.sub(r'(<div\s+[^>]*align=["\']center["\'][^>]*>.*?</div>)', protect_match, text_to_translate, flags=re.DOTALL | re.IGNORECASE)
51+
# 2. Protect Logo Block (Robust regex for style attribute)
52+
text_to_translate = re.sub(r'(<div\s+[^>]*style=["\'][^"\']*text-align:\s*center[^"\']*["\'][^>]*>.*?</div>)', protect_match, text_to_translate, flags=re.DOTALL | re.IGNORECASE)
53+
# 3. Protect Badges (Robust regex for shields.io URLs)
54+
text_to_translate = re.sub(r'(!\[[^\]]*\]\(https?://img\.shields\.io/[^\)]+\))', protect_match, text_to_translate, flags=re.IGNORECASE)
5255

5356
# Refined Prompt for CJK and Technical Nuance
5457
prompt = f"""<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
55-
You are a professional technical translator. Translate this GitHub README into {target_lang_name}.
58+
You are a professional technical translator. Translate the provided README into professional developer-level {target_lang_name}.
5659
CRITICAL RULES:
57-
1. **Badges**: Do NOT translate text inside `![...]` or `(...)` for image badges.
58-
2. **Standard Terms**: Keep terms like GPU, VRAM, CLI, API, CUDA, and Docker in English.
59-
3. **Context**:
60+
1. **Badges**: Do NOT translate Markdown image syntax. Specifically, do NOT translate text inside square brackets ![...] or parentheses (...) for badge lines (e.g., license, python).
61+
2. **Navigation**: Do NOT modify the top-level HTML navigation bar (`<div align="center">`).
62+
3. **Context**: Treat 'Enforcement' as 'System policy restriction' and 'Headless' as 'server without GUI'.
63+
4. **Technical Integrity**: Preserve industry-standard terms (GPU, CLI, VRAM, SSH, Docker, API, CUDA) exactly as they appear in English.
64+
5. **Formatting**: Preserve all emojis and HTML/Markdown tags exactly.
65+
6. **No Talk**: Output ONLY the translated text. Do not include markdown code fences (```) around the entire output.
66+
7. **Context**:
6067
- 'Enforcement' = Policy restriction/application (JA: 制限/強制, ZH: 强制执行).
6168
- 'Headless' = Servers without a display (JA: ヘッドレス, ZH: 无头).
6269
- 'Agnostic' = Independence (JA: 非依存, ZH: 无关性).
63-
4. **Placeholders**: Return any text like '[PROTECTED_BLOCK_X]' exactly as is.
64-
5. **Output**: ONLY the translation. No conversational filler.<|END_OF_TURN_TOKEN|>
70+
8. **System Tags**: Return any text in the format [[PB_X]] exactly as is. These are code identifiers, NOT text. Do NOT translate the word 'PB' or change the brackets.<|END_OF_TURN_TOKEN|>
71+
6572
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
6673
{text_to_translate}<|END_OF_TURN_TOKEN|>
6774
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"""
@@ -73,13 +80,27 @@ def protect_match(match):
7380

7481
# 1. Restore Protected Blocks
7582
for i, block in enumerate(protected_blocks):
76-
translated_content = translated_content.replace(f"[PROTECTED_BLOCK_{i}]", block)
83+
# This regex looks for common translations or variations of your placeholder
84+
# It catches [[PB_0]] and spaced variants such as [ [PB_0] ]; the match is case-sensitive, so [[pb_0]] is left to the fallback below.
85+
tag_pattern = rf"\[\s*\[\s*PB_{i}\s*\]\s*\]"
86+
87+
# Check if the tag exists. If not, look for localized versions like [[BLOQUE_0]]
88+
if not re.search(tag_pattern, translated_content):
89+
# Fallback regex to find localized "Block" or "PB" followed by your index i
90+
# Use [^\]]* to avoid greedily matching across multiple tags if they appear on one line
91+
tag_pattern = rf"\[\s*\[\s*[^\]]*_{i}\s*\]\s*\]"
92+
93+
# Use lambda to avoid backslash escaping issues if the block content contains them
94+
translated_content = re.sub(tag_pattern, lambda m: block, translated_content)
7795

7896
# 2. Path Correction (Support single and double quotes)
79-
translated_content = re.sub(r'(\[.*?\]\()(?!(?:http|/|#|\.\./|locales/))', r'\1../', translated_content)
80-
translated_content = re.sub(r'((?:src|href)=["\'])(?!(?:http|/|#|\.\./|locales/))', r'\1../', translated_content)
97+
# First, remove 'locales/' prefix if the LLM hallucinated it (so we can correctly prepend ../ later)
8198
translated_content = re.sub(r'(\[.*?\]\()locales/', r'\1', translated_content)
8299
translated_content = re.sub(r'((?:src|href)=["\'])locales/', r'\1', translated_content)
83100

101+
# Then, prepend ../ to relative paths
102+
translated_content = re.sub(r'(\[.*?\]\()(?!(?:http|/|#|\.\./))', r'\1../', translated_content)
103+
translated_content = re.sub(r'((?:src|href)=["\'])(?!(?:http|/|#|\.\./))', r'\1../', translated_content)
104+
84105
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
85106
f.write(translated_content)

0 commit comments

Comments
 (0)