import argparse
import os
import re
import sys

from llama_cpp import Llama
5+
# Maps the value of the --lang option to the language name that is
# interpolated into the translation prompt.
LANG_MAP = {
    "ja": "Japanese",
    "zh": "Chinese(Simplified)",
    "ko": "Korean",
    "hi": "Hindi",
}
12+
# Command-line interface: --lang is the target language code. It selects the
# prompt language via LANG_MAP and is also embedded in the output file name.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--lang",
    type=str,
    required=True,
    help=f"target language code (known codes: {', '.join(LANG_MAP)})",
)
args = parser.parse_args()

# Codes missing from LANG_MAP keep the historical fallback to "English", but
# the fallback is no longer silent: the output is still written under the
# requested code, so the mismatch is worth surfacing to the caller.
if args.lang not in LANG_MAP:
    print(
        f"warning: unknown --lang {args.lang!r}; "
        f"falling back to 'English' as the target language",
        file=sys.stderr,
    )
target_lang_name = LANG_MAP.get(args.lang, "English")
17+
# BASE_DIR is the parent of the directory that contains this script; the
# README, the output directory and the model file are resolved against it.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
README_PATH = os.path.join(BASE_DIR, "README.md")
OUTPUT_DIR = os.path.join(BASE_DIR, "locales")
# One output file per language code, e.g. locales/README.ja.md.
OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"README.{args.lang}.md")
MODEL_PATH = os.path.join(BASE_DIR, "models", "aya-expanse-8b-q4_k_s.gguf")
23+
# Make sure the output directory exists before any work is done.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the local GGUF model. n_ctx is the context-window size passed to
# llama.cpp; the prompt built from the README has to fit inside it.
llm = Llama(model_path=MODEL_PATH, n_ctx=6144, n_threads=2, verbose=False)

# Read the source README as UTF-8 text.
with open(README_PATH, "r", encoding="utf-8") as f:
    original_text = f.read()
29+
# --- PRE-PROCESSING ---
# Verbatim README fragments that must not be rewritten by the model.
# protected_blocks[n] is represented in the prompt by the token __PB_<n>__.
protected_blocks = []


def protect_match(match):
    """Stash a regex match and return the placeholder that replaces it.

    Meant to be used as the ``repl`` callback of ``re.sub``: the full matched
    text is appended to ``protected_blocks`` and the token ``__PB_<n>__`` is
    returned, where ``<n>`` is the index the text received in that list.
    """
    placeholder = f"__PB_{len(protected_blocks)}__"
    protected_blocks.append(match.group(0))
    return placeholder


text_to_translate = original_text

# 1. Protect Navigation Bar
text_to_translate = re.sub(
    r'(<div\s+[^>]*align=["\']center["\'][^>]*>.*?</div>)',
    protect_match,
    text_to_translate,
    flags=re.DOTALL | re.IGNORECASE,
)
# 2. Protect Logo Block
text_to_translate = re.sub(
    r'(<div\s+[^>]*style=["\'][^"\']*text-align:\s*center[^"\']*["\'][^>]*>.*?</div>)',
    protect_match,
    text_to_translate,
    flags=re.DOTALL | re.IGNORECASE,
)
# 3. Protect ALL Images (Badges + Gallery)
text_to_translate = re.sub(
    r'(!\[[^\]\r\n]*\]\([^)\r\n]+\))',
    protect_match,
    text_to_translate,
)
# NOTE(review): each pass runs on the output of the previous one, so a block
# captured by a later pass can contain a placeholder from an earlier pass
# (e.g. a centered <div> nested in another one). Such an inner placeholder
# never reaches the model output and is handled by the last-resort fallback
# during restoration -- confirm whether nested blocks need real support.

# Specialized Prompt for CJK/Eastern Languages
prompt = f"""<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
You are a professional technical {target_lang_name} translator. Translate the provided GitHub README into {target_lang_name}.

CRITICAL RULES:
1. **Placeholders**: You will see tags like __PB_0__, __PB_1__.
- DO NOT translate them.
- DO NOT remove them.
- DO NOT convert underscores (_) to full-width characters. Keep them as it is.
2. **Formatting**: Preserve all Markdown structure exactly.
3. **Terminology**: Keep English technical terms (GPU, CLI, VRAM, Docker, CUDA) in English.
4. **Context**:
- 'Enforcement' = Policy restriction (e.g., JA: 制限/強制).
- 'Headless' = Server without display.
5. **Output**: ONLY the translated text. No explanations.
<|END_OF_TURN_TOKEN|>
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
{text_to_translate}<|END_OF_TURN_TOKEN|>
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"""

# temperature=0 requests deterministic decoding; generation stops at the
# end-of-turn token.
response = llm(prompt, max_tokens=6144, temperature=0, stop=["<|END_OF_TURN_TOKEN|>"])
translated_content = response['choices'][0]['text'].strip()

# --- POST-PROCESSING: Chain Restoration ---


def _loose_placeholder_pattern(index):
    """Compile a tolerant matcher for a damaged ``__PB_<index>__`` token.

    Accepts ASCII or full-width (U+FF3F) underscores in any count, optional
    whitespace between the parts, and an optional pair of ASCII or full-width
    (U+FF3B / U+FF3D) square brackets wrapped around the token. A bracket
    pair directly followed by ``(`` is left in place because it is Markdown
    link syntax around a protected image, e.g. ``[__PB_3__](https://...)``.
    """
    core = rf"[_\uFF3F]+\s*PB\s*[_\uFF3F]\s*{index}\s*[_\uFF3F]+"
    return re.compile(
        rf"[\[\uFF3B]\s*{core}\s*[\]\uFF3D](?!\()|{core}",
        re.IGNORECASE,
    )


def _restore_protected_blocks(text, blocks):
    """Return *text* with every placeholder replaced by its original block.

    For each block, in index order, the first strategy that applies is used:

    1. exact replacement of ``__PB_<n>__``;
    2. replacement of a damaged placeholder (see
       ``_loose_placeholder_pattern``);
    3. if the model dropped the placeholder entirely, insertion right after
       the previous block when that block is present in the text, otherwise
       at the very top, so that no protected content is lost.
    """
    for index, block in enumerate(blocks):
        placeholder = f"__PB_{index}__"

        # 1. Direct replacement
        if placeholder in text:
            text = text.replace(placeholder, block)
            continue

        # 2. Loose Regex Fallback (handles CJK full-width variants such as
        #    a placeholder written with U+FF3F underscores).
        # A callable replacement keeps backslashes in the block literal.
        text, replaced = _loose_placeholder_pattern(index).subn(lambda _m: block, text)
        if replaced:
            continue

        # 3. CRITICAL FALLBACK: Chain Insertion
        prev_block = blocks[index - 1] if index > 0 else None
        if prev_block is not None and prev_block in text:
            # Insert current block immediately after the previous one
            text = text.replace(prev_block, prev_block + "\n" + block, 1)
        else:
            text = block + "\n\n" + text
    return text


translated_content = _restore_protected_blocks(translated_content, protected_blocks)
97+
# 4. Path Correction
# The translated file is written one directory below the README it was
# generated from (see OUTPUT_DIR), so relative links must be rebased.

# Remove 'locales/' hallucination
translated_content = re.sub(r'(\[.*?\]\()locales/', r'\1', translated_content)
translated_content = re.sub(r'((?:src|href)=["\'])locales/', r'\1', translated_content)

# Prepend ../ to relative paths
# A target is left alone when it starts with a URI scheme ("https:",
# "mailto:", "data:", ...), is root-relative ("/"), is an in-page anchor
# ("#"), or already points to the parent directory ("../").
_NON_RELATIVE_TARGET = r'(?![a-zA-Z][a-zA-Z0-9+.\-]*:|/|#|\.\./)'
translated_content = re.sub(
    r'(\[.*?\]\()' + _NON_RELATIVE_TARGET, r'\1../', translated_content
)
translated_content = re.sub(
    r'((?:src|href)=["\'])' + _NON_RELATIVE_TARGET, r'\1../', translated_content
)

# 5. Cleanup
# Drop an HTML comment opener at the very start and a closer at the very end
# of the text.
translated_content = re.sub(r'^<!--\s*|\s*-->$', '', translated_content).strip()
# Unwrap the text when it starts with a Markdown code fence: remove the
# opening fence line and, if the last line is a fence too, that line as well.
if translated_content.startswith("```"):
    lines = translated_content.splitlines()[1:]
    if lines and lines[-1].strip().startswith("```"):
        lines = lines[:-1]
    translated_content = "\n".join(lines).strip()
114+
# Write the final translation as UTF-8, replacing any previous output for
# this language.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.write(translated_content)
0 commit comments