33import argparse
44from llama_cpp import Llama
55
6- # Map language codes to full English names for the system prompt
76LANG_MAP = {
8- "de" : "German" ,
9- "fr" : "French" ,
7+ "de" : "German" ,
8+ "fr" : "French" ,
109 "es" : "Spanish" ,
11- "ja" : "Japanese" ,
12- "zh" : "Simplified Chinese" ,
13- "ru" : "Russian" ,
14- "pt" : "Portuguese" ,
10+ "ja" : "Japanese" ,
11+ "zh" : "Chinese(Simplified) " ,
12+ "ru" : "Russian" ,
13+ "pt" : "Portuguese" ,
1514 "ko" : "Korean" ,
1615}
1716
1817parser = argparse .ArgumentParser ()
19- parser .add_argument ("--lang" , type = str , required = True , help = "Target language code (e.g., de, fr)" )
18+ parser .add_argument ("--lang" , type = str , required = True )
2019args = parser .parse_args ()
21-
2220target_lang_name = LANG_MAP .get (args .lang , "English" )
2321
22+ # Path Configuration
2423BASE_DIR = os .path .dirname (os .path .dirname (os .path .abspath (__file__ )))
2524README_PATH = os .path .join (BASE_DIR , "README.md" )
2625OUTPUT_DIR = os .path .join (BASE_DIR , "locales" )
2726OUTPUT_PATH = os .path .join (OUTPUT_DIR , f"README.{ args .lang } .md" )
2827MODEL_PATH = os .path .join (BASE_DIR , "models" , "aya-expanse-8b-q4_k_s.gguf" )
2928
30- # Ensure output directory exists
3129os .makedirs (OUTPUT_DIR , exist_ok = True )
32-
33- # Set n_ctx to 6144 as a safe middle ground for an 8B model on 7 GB of RAM.
34- # Set n_threads=2 to match the vCPUs of the GitHub Actions runner.
3530llm = Llama (model_path = MODEL_PATH , n_ctx = 6144 , n_threads = 2 , verbose = False )
3631
3732with open (README_PATH , "r" , encoding = "utf-8" ) as f :
38- text_to_translate = f .read ()
33+ original_text = f .read ()
34+
35+ # --- PRE-PROCESSING: Protect Navigation Bar ---
36+ nav_match = re .search (r'(<div align="center">.*?</div>)' , original_text , re .DOTALL )
37+ nav_placeholder = "[NAV_BAR_PROTECTED_BLOCK]"
38+ text_to_translate = original_text
39+ if nav_match :
40+ text_to_translate = text_to_translate .replace (nav_match .group (1 ), nav_placeholder )
3941
40- # Aya Expanse-specific header format: system / user / chatbot turns
42+ # Refined Prompt for CJK and Technical Nuance
4143prompt = f"""<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
42- You are a professional technical translator specializing in software documentation (GitHub READMEs).
43- Translate the provided README into professional developer-level { target_lang_name } .
44+ You are a professional technical translator. Translate this GitHub README into { target_lang_name } .
4445CRITICAL RULES:
45- 1. **Badges**: Do NOT translate Markdown image syntax. Specifically, do NOT translate text inside square brackets `![...]` or parentheses `(...)` for badge lines (e.g., license, python, version badges).
46- 2. **Navigation**: Do NOT modify the top-level HTML navigation bar (`<div align="center">`).
47- 3. **Context**: Treat 'Enforcement' as 'System policy restriction' and 'Headless' as 'server without GUI'.
48- 4. **Technical Integrity**: Preserve standard terms (GPU, CLI, VRAM, SSH, Docker) in English.
49- 5. **Formatting**: Preserve all emojis and HTML/Markdown tags exactly.
50- 6. **No Talk**: Output ONLY the translated text. Do not include markdown code fences (```) around the entire output.<|END_OF_TURN_TOKEN|>
46+ 1. **Badges**: Do NOT translate text inside `![...]` or `(...)` for image badges.
47+ 2. **Standard Terms**: Keep terms like GPU, VRAM, CLI, API, CUDA, and Docker in English.
48+ 3. **Context**:
49+ - 'Enforcement' = Policy restriction/application (JA: 制限/強制, ZH: 强制执行).
50+ - 'Headless' = Servers without a display (JA: ヘッドレス, ZH: 无头).
51+ - 'Agnostic' = Independence (JA: 非依存, ZH: 无关性).
52+ 4. **Placeholders**: Return any text like '{ nav_placeholder } ' exactly as is.
53+ 5. **Output**: ONLY the translation. No conversational filler.<|END_OF_TURN_TOKEN|>
5154<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
5255{ text_to_translate } <|END_OF_TURN_TOKEN|>
53- <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
54- """
55- # Do not translate badges. Keep them as they are.
56- # IMPORTANT: Do not strip or modify the top-level HTML tags (like <div> or <img>) at the beginning of the file.
57- # ONLY output the translated {target_lang_name} text. No talk, just translation.
58- # max_tokens must be less than n_ctx (6144); 4096 would leave ~2000 tokens for the README input. NOTE: the call below passes max_tokens = 6144, not 4096.
59- response = llm (
60- prompt ,
61- max_tokens = 6144 ,
62- temperature = 0 , # Set to 0 for maximum determinism in translation
63- stop = ["<|END_OF_TURN_TOKEN|>" , "<|START_OF_TURN_TOKEN|>" ]
64- )
56+ <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"""
6557
58+ response = llm (prompt , max_tokens = 6144 , temperature = 0 , stop = ["<|END_OF_TURN_TOKEN|>" ])
6659translated_content = response ['choices' ][0 ]['text' ].strip ()
6760
68- # 1. CLEANUP: Remove markdown code fences if the LLM included them
69- if translated_content .startswith ("```" ):
70- lines = translated_content .splitlines ()
71- if lines [0 ].startswith ("```" ):
72- lines = lines [1 :]
73- if lines and lines [- 1 ].strip ().startswith ("```" ):
74- lines = lines [:- 1 ]
75- translated_content = "\n " .join (lines ).strip ()
61+ # --- POST-PROCESSING ---
7662
77- # 2. FIX PATHS: Handle relative paths for files in /locales/
78- # We need to ensure that links to the root go up one level (../)
79- # but links to other files in the same /locales/ folder stay relative.
63+ # 1. Restore Navigation Bar
64+ if nav_match :
65+ translated_content = translated_content . replace ( nav_placeholder , nav_match . group ( 1 ))
8066
81- # Step 1: Prepend ../ to relative paths (ignoring external links, absolute paths, anchors, or locales/)
82- # This targets the path in Markdown links/images ([text](path)) and in HTML src="path"/href="path" attributes
83- translated_content = re .sub (r'(\[.*?\]\()(?!(?:http|/|#|\.\./|locales/))' , r'\1../' , translated_content )
84- translated_content = re .sub (r'((?:src|href)=")(?!(?:http|/|#|\.\./|locales/))' , r'\1../' , translated_content )
67+ # 2. Advanced Badge Restoration (Key-based)
68+ # This handles cases where the LLM translates the URL parameters
69+ badge_keys = ["license" , "python" , "version" , "platform" , "cuda" ]
70+ for key in badge_keys :
71+ # Find the original badge line for this key
72+ orig_badge = re .search (rf'(!\[.*?\]\(https://img\.shields\.io/badge/{ key } .*?\))' , original_text , re .I )
73+ if orig_badge :
74+ # Find and replace the translated version in the output
75+ translated_content = re .sub (rf'!\[.*?\]\(https://img\.shields\.io/badge/{ key } .*?\)' , orig_badge .group (1 ), translated_content , flags = re .I )
8576
86- # Step 2: Handle links that point to the locales directory.
87- # Since the translated file is ALREADY in / locales/, we strip the 'locales/' prefix
88- # so they point to the sibling files in the same directory.
77+ # 3. Path Correction (Support single and double quotes)
78+ translated_content = re . sub ( r'(\[.*?\]\()(?!(?:http|/|#|\.\./| locales/))' , r'\1../' , translated_content )
79+ translated_content = re . sub ( r'((?:src|href)=["\'])(?!(?:http|/|#|\.\./|locales/))' , r'\1../' , translated_content )
8980translated_content = re .sub (r'(\[.*?\]\()locales/' , r'\1' , translated_content )
90- translated_content = re .sub (r'((?:src|href)=")locales/' , r'\1' , translated_content )
91-
92- # 3. RESTORE BADGES: Ensure badges match the original English README exactly.
93- # This fixes cases where the LLM translates the Alt text (e.g., ![License] -> ![Lizenz])
94- # or slightly alters the URL.
95-
96- # Extract all shields.io badges from the original source text
97- original_badges = re .findall (r'(!\[.*?\]\(https://img\.shields\.io/.*?\))' , text_to_translate )
98-
99- for badge in original_badges :
100- # Extract the URL from the original badge to use as a key
101- match = re .search (r'\((https://img\.shields\.io/.*?)\)' , badge )
102- if match :
103- url = match .group (1 )
104- # Replace any markdown image in the translated text that has this URL
105- # with the exact original badge string.
106- translated_content = re .sub (rf'!\[.*?\]\({ re .escape (url )} \)' , lambda m : badge , translated_content )
81+ translated_content = re .sub (r'((?:src|href)=["\'])locales/' , r'\1' , translated_content )
10782
10883with open (OUTPUT_PATH , "w" , encoding = "utf-8" ) as f :
109- f .write (translated_content )
110-
111- print (f"Translation to { target_lang_name } complete: { OUTPUT_PATH } " )
84+ f .write (translated_content )
0 commit comments