Fix translation prompts: replace placeholder protection with explicit preservation rules

DataBoySu · DataBoySu · commit b4a27aa7655c · 2025-12-26T19:34:13.000+05:30
diff --git a/scripts/translate_eastern.py b/scripts/translate_eastern.py
@@ -27,40 +27,24 @@
 with open(README_PATH, "r", encoding="utf-8") as f:
     original_text = f.read()
 
-# --- PRE-PROCESSING ---
-protected_blocks = []
-
-def protect_match(match):
-    placeholder = f"__PB_{len(protected_blocks)}__" 
-    protected_blocks.append(match.group(0))
-    return placeholder
-
 text_to_translate = original_text
 
-# 1. Protect Navigation Bar (Whole Block)
-text_to_translate = re.sub(r'(<div\s+align=["\']center["\']\s*>.*?</div>)', protect_match, text_to_translate, flags=re.DOTALL | re.IGNORECASE)
-# 2. Protect Logo (Whole Block)
-text_to_translate = re.sub(r'(<div\s+style=["\'][^"]*text-align:center[^"]*["\']\s*>.*?</div>)', protect_match, text_to_translate, flags=re.DOTALL | re.IGNORECASE)
-# 3. Protect Markdown Images (Badges, Screenshots)
-text_to_translate = re.sub(r'(!\[[^\]\r\n]*\]\([^)\r\n]+\))', protect_match, text_to_translate)
-# 4. Protect HTML Tags (Preserves Gallery structure <details>, <summary>, <img> but exposes text)
-text_to_translate = re.sub(r'(<[^>]+>)', protect_match, text_to_translate)
-
 # Specialized Prompt for CJK/Eastern Languages
 prompt = f"""<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
-You are a professional technical {target_lang_name} translator. Translate the provided GitHub README into {target_lang_name}.
+You are a professional technical {target_lang_name} translator. 
+Translate the provided GitHub README into {target_lang_name}.
 
 CRITICAL RULES:
-1. **Placeholders**: You will see tags like __PB_0__, __PB_1__. 
-   - DO NOT translate them.
-   - DO NOT remove them.
-   - DO NOT convert underscores (_) to full-width characters. Keep them as it is.
-2. **Formatting**: Preserve all Markdown structure exactly.
-3. **Terminology**: Keep English technical terms (GPU, CLI, VRAM, Docker, CUDA) in English.
-4. **Context**: 
+1. **Navigation Bar**: The top HTML block (`<div align="center">...</div>`) containing language links MUST remain EXACTLY the same. Do NOT translate the filenames or language names inside it. Do not change anything in it.
+2. **Badges**: Do NOT translate the text inside badge links (e.g., `![License]`, `![Python]`). Keep the URLs exactly as is.
+3. **Logo**: Keep the logo HTML block (`<div style="text-align:center...`) exactly as is.
+4. **Formatting**: Preserve all emojis and HTML/Markdown tags exactly.
+5. **Terminology**: Keep English technical terms (GPU, CLI, VRAM, Docker, CUDA) in English.
+6. **Context**: 
    - 'Enforcement' = Policy restriction (e.g., JA: 制限/強制).
-   - 'Headless' = Server without display.
-5. **Output**: ONLY the translated text. No explanations.
+   - 'Headless' = Server without GUI/display.
+   - 'Agnostic' = Hardware Independence (JA: 非依存, ZH: 无关性).
+7. **Output**: ONLY the translated text. No explanations.
 <|END_OF_TURN_TOKEN|>
 <|START_OF_TURN_TOKEN|><|USER_TOKEN|>
 {text_to_translate}<|END_OF_TURN_TOKEN|>
@@ -69,46 +53,20 @@ def protect_match(match):
 response = llm(prompt, max_tokens=6144, temperature=0, stop=["<|END_OF_TURN_TOKEN|>"])
 translated_content = response['choices'][0]['text'].strip()
 
-# --- POST-PROCESSING: Chain Restoration ---
-
-for i, block in enumerate(protected_blocks):
-    placeholder = f"__PB_{i}__"
-    
-    # 1. Direct replacement
-    if placeholder in translated_content:
-        translated_content = translated_content.replace(placeholder, block)
-        continue
-    
-    # 2. Loose Regex Fallback (Handles CJK full-width issues like ＿PB＿0＿)
-    # Matches __PB_0__, ＿PB_0＿, [PB_0], etc.
-    loose_pattern = re.compile(rf"[\[［]?\s*[__＿]+\s*PB_{i}\s*[__＿]+\s*[\]］]?", re.IGNORECASE)
-    if loose_pattern.search(translated_content):
-        translated_content = loose_pattern.sub(lambda m: block, translated_content)
-        continue
-
-    # 3. CRITICAL FALLBACK: Chain Insertion
-    if i == 0: 
-        translated_content = block + "\n\n" + translated_content
-    else:
-        prev_block = protected_blocks[i-1]
-        if prev_block in translated_content:
-            # Insert current block immediately after the previous one
-            translated_content = translated_content.replace(prev_block, prev_block + "\n" + block, 1)
-        else:
-            translated_content = block + "\n\n" + translated_content
+# --- POST-PROCESSING ---
+# 1. CLEANUP: Remove markdown code fences if the LLM included them
+if translated_content.startswith("```"):
+    lines = translated_content.splitlines()
+    if lines[0].startswith("```"):
+        lines = lines[1:]
+    if lines and lines[-1].strip().startswith("```"):
+        lines = lines[:-1]
+    translated_content = "\n".join(lines).strip()
 
-# 4. Path Correction
+# 2. Path Correction
 # Prepend ../ to relative paths
 translated_content = re.sub(r'(\[.*?\]\()(?!(?:http|/|#|\.\./))', r'\1../', translated_content)
 translated_content = re.sub(r'((?:src|href)=["\'])(?!(?:http|/|#|\.\./))', r'\1../', translated_content)
 
-# 5. Cleanup
-translated_content = re.sub(r'^<!--\s*|(?:\s*)?-->$', '', translated_content).strip()
-if translated_content.startswith("```"):
-    lines = translated_content.splitlines()
-    if lines[0].startswith("```"): lines = lines[1:]
-    if lines and lines[-1].strip().startswith("```"): lines = lines[:-1]
-    translated_content = "\n".join(lines).strip()
-
 with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
     f.write(translated_content)
diff --git a/scripts/translate_western.py b/scripts/translate_western.py
@@ -27,33 +27,20 @@
 with open(README_PATH, "r", encoding="utf-8") as f:
     original_text = f.read()
 
-# --- PRE-PROCESSING: Protect Sensitive Blocks ---
-protected_blocks = []
-
-def protect_match(match):
-    # Use underscores to look like code variables, which LLMs respect more
-    placeholder = f"__PB_{len(protected_blocks)}__" 
-    protected_blocks.append(match.group(0))
-    return placeholder
+# --- PRE-PROCESSING ---
 
 text_to_translate = original_text
 
-# 1. Protect Navigation Bar (Whole Block)
-text_to_translate = re.sub(r'(<div\s+align=["\']center["\']\s*>.*?</div>)', protect_match, text_to_translate, flags=re.DOTALL | re.IGNORECASE)
-# 2. Protect Logo (Whole Block)
-text_to_translate = re.sub(r'(<div\s+style=["\'][^"]*text-align:center[^"]*["\']\s*>.*?</div>)', protect_match, text_to_translate, flags=re.DOTALL | re.IGNORECASE)
-# 3. Protect Markdown Images (Badges, Screenshots)
-text_to_translate = re.sub(r'(!\[[^\]\r\n]*\]\([^)\r\n]+\))', protect_match, text_to_translate)
-# 4. Protect HTML Tags (Preserves Gallery structure <details>, <summary>, <img> but exposes text)
-text_to_translate = re.sub(r'(<[^>]+>)', protect_match, text_to_translate)
-
 prompt = f"""<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
 You are a professional technical {target_lang_name} translator. Translate the provided README into professional developer-level {target_lang_name}.
+
 CRITICAL RULES:
-1. **Structure**: Keep the layout exactly the same.
-2. **Placeholders**: You will see placeholders like __PB_0__, __PB_1__. These are images or layout blocks. KEEP THEM EXACTLY AS IS. Do not move or translate them.
-3. **Terminology**: Preserve terms like GPU, CLI, VRAM, SSH, Docker, API, CUDA.
-4. **No Talk**: Output ONLY the translated text.
+1. **Navigation Bar**: The top HTML block (`<div align="center">...</div>`) containing language links MUST remain EXACTLY the same. Do NOT translate the filenames or language names inside it.
+2. **Badges**: Do NOT translate the text inside badge links (e.g., `![License]`, `![Python]`). Keep the URLs exactly as is.
+3. **Logo**: Keep the logo HTML block (`<div style="text-align:center...`) exactly as is.
+4. **Formatting**: Preserve all emojis and HTML/Markdown tags exactly.
+5. **Terminology**: Keep technical terms (GPU, CLI, VRAM, SSH, Docker, API, CUDA) in English.
+6. **No Talk**: Output ONLY the translated text. Do not wrap the output in markdown code fences (```).
 <|END_OF_TURN_TOKEN|>
 <|START_OF_TURN_TOKEN|><|USER_TOKEN|>
 {text_to_translate}<|END_OF_TURN_TOKEN|>
@@ -62,49 +49,21 @@ def protect_match(match):
 response = llm(prompt, max_tokens=6144, temperature=0, stop=["<|END_OF_TURN_TOKEN|>"])
 translated_content = response['choices'][0]['text'].strip()
 
-# --- POST-PROCESSING: Chain Restoration ---
-
-for i, block in enumerate(protected_blocks):
-    placeholder = f"__PB_{i}__"
-    
-    # 1. Direct replacement (Best case)
-    if placeholder in translated_content:
-        translated_content = translated_content.replace(placeholder, block)
-        continue
-    
-    # 2. Loose Regex Fallback (Handles spacing issues)
-    loose_pattern = re.compile(rf"\[?\s*__\s*PB_{i}\s*__\s*\]?", re.IGNORECASE)
-    if loose_pattern.search(translated_content):
-        translated_content = loose_pattern.sub(lambda m: block, translated_content)
-        continue
+# --- POST-PROCESSING ---
+# 1. CLEANUP: Remove markdown code fences if the LLM included them
+if translated_content.startswith("```"):
+    lines = translated_content.splitlines()
+    if lines[0].startswith("```"):
+        lines = lines[1:]
+    if lines and lines[-1].strip().startswith("```"):
+        lines = lines[:-1]
+    translated_content = "\n".join(lines).strip()
 
-    # 3. CRITICAL FALLBACK: Chain Insertion
-    # If a block is missing, insert it immediately after the previous block.
-    # This ensures Nav -> Logo -> Badge1 -> Badge2 order is preserved even if the LLM drops them.
-    
-    if i == 0: 
-        # Nav missing? Prepend to file.
-        translated_content = block + "\n\n" + translated_content
-    else:
-        # Insert after the previous block (which is guaranteed to be in the text now)
-        prev_block = protected_blocks[i-1]
-        if prev_block in translated_content:
-            # Replace the previous block with "Previous + New"
-            # We use a specific check to avoid duplicating if the previous block appears multiple times (unlikely for these headers)
-            translated_content = translated_content.replace(prev_block, prev_block + "\n" + block, 1)
-        else:
-            # If previous block is somehow missing (shouldn't happen due to loop order), just prepend
-            translated_content = block + "\n\n" + translated_content
 
-# 4. Path Correction
+# 2. Path Correction
 # Prepend ../ to relative paths
 translated_content = re.sub(r'(\[.*?\]\()(?!(?:http|/|#|\.\./))', r'\1../', translated_content)
 translated_content = re.sub(r'((?:src|href)=["\'])(?!(?:http|/|#|\.\./))', r'\1../', translated_content)
 
-# 5. Cleanup
-translated_content = re.sub(r'^<!--\s*|(?:\s*)?-->$', '', translated_content).strip()
-if translated_content.startswith("```"):
-    translated_content = "\n".join(translated_content.splitlines()[1:-1]).strip()
-
 with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
     f.write(translated_content)