|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": null, |
| 5 | + "execution_count": 1, |
6 | 6 | "id": "5028def7", |
7 | 7 | "metadata": {}, |
8 | 8 | "outputs": [], |
9 | | - "source": [] |
| 9 | + "source": [ |
| 10 | + "import os\n", |
| 11 | + "import re\n", |
| 12 | + "import argparse\n", |
| 13 | + "from openai import OpenAI # pip install openai" |
| 14 | + ] |
| 15 | + }, |
| 16 | + { |
| 17 | + "cell_type": "code", |
| 18 | + "execution_count": 2, |
| 19 | + "id": "86763887", |
| 20 | + "metadata": {}, |
| 21 | + "outputs": [], |
| 22 | + "source": [ |
| 23 | + "# 1. Configuration: maps a two-letter language code to the language name that is interpolated into the translation prompt.\n", |
| 24 | + "LANG_MAP = {\n", |
| 25 | + " \"de\": \"German\", \"fr\": \"French\", \"es\": \"Spanish\", \"ja\": \"Japanese\", \n", |
| 26 | + " \"zh\": \"Chinese(Simplified)\", \"ru\": \"Russian\", \"pt\": \"Portuguese\", \n", |
| 27 | + " \"ko\": \"Korean\", \"hi\": \"Hindi\"\n", |
| 28 | + "}" |
| 29 | + ] |
| 30 | + }, |
| 31 | + { |
| 32 | + "cell_type": "code", |
| 33 | + "execution_count": 14, |
| 34 | + "id": "1d09235e", |
| 35 | + "metadata": {}, |
| 36 | + "outputs": [], |
| 37 | + "source": [ |
| 38 | + "LANG = \"de\"  # target language code; selects the LANG_MAP entry and the README.<code>.md output name\n", |
| 39 | + "target_lang_name = LANG_MAP.get(LANG, \"English\")  # NOTE(review): unknown codes silently fall back to \"English\" - confirm this is intended" |
| 40 | + ] |
| 41 | + }, |
| 42 | + { |
| 43 | + "cell_type": "code", |
| 44 | + "execution_count": null, |
| 45 | + "id": "cd1dedca", |
| 46 | + "metadata": {}, |
| 47 | + "outputs": [], |
| 48 | + "source": [ |
| 49 | + "# File paths: the source README is read from the current working directory; the translation goes to locales/README.<LANG>.md beneath it.\n", |
| 50 | + "BASE_DIR = os.getcwd()\n", |
| 51 | + "README_PATH = os.path.join(BASE_DIR, \"README.md\")\n", |
| 52 | + "OUTPUT_PATH = os.path.join(BASE_DIR, \"locales\", f\"README.{LANG}.md\")\n", |
| 53 | + "# Create the locales/ directory up front; exist_ok=True makes this safe to re-run.\n", |
| 54 | + "os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)" |
| 55 | + ] |
| 56 | + }, |
| 57 | + { |
| 58 | + "cell_type": "code", |
| 59 | + "execution_count": null, |
| 60 | + "id": "297cf96f", |
| 61 | + "metadata": {}, |
| 62 | + "outputs": [], |
| 63 | + "source": [ |
| 64 | + "# Connect to LM Studio's local OpenAI-compatible server (ensure the port matches your LM Studio server settings).\n", |
| 65 | + "client = OpenAI(base_url=\"http://localhost:5432/v1\", api_key=\"lm-studio\")  # NOTE(review): 5432 is PostgreSQL's customary port and LM Studio usually listens on 1234 - confirm" |
| 66 | + ] |
| 67 | + }, |
| 68 | + { |
| 69 | + "cell_type": "code", |
| 70 | + "execution_count": null, |
| 71 | + "id": "db37de98", |
| 72 | + "metadata": {}, |
| 73 | + "outputs": [], |
| 74 | + "source": [ |
| 75 | + "with open(README_PATH, \"r\", encoding=\"utf-8\") as f:\n", |
| 76 | + " original_text = f.read()\n", |
| 77 | + "# No pre-processing yet: the README is handed to the model whole, in a single request.\n", |
| 78 | + "text_to_translate = original_text" |
| 79 | + ] |
| 80 | + }, |
| 81 | + { |
| 82 | + "cell_type": "code", |
| 83 | + "execution_count": null, |
| 84 | + "id": "865cb4aa", |
| 85 | + "metadata": {}, |
| 86 | + "outputs": [], |
| 87 | + "source": [ |
| 88 | + "# --- TRANSLATION via LM STUDIO ---\n", |
| 89 | + "prompt = f\"\"\"You are a precise, literal technical translator. Translate the README into professional {target_lang_name} while obeying the following STRICT rules.\n", |
| 90 | + "\n", |
| 91 | + "MANDATORY RULES (do not ignore):\n", |
| 92 | + "- CJK Token Separation: Do NOT merge or delete punctuation, spaces, or line breaks that sit between an HTML tag and target language text.\n", |
| 93 | + "- Visual Preservation: Treat every < and > as a hard boundary. Do not allow any Chinese or Japanese characters to \"leak\" inside a tag's brackets.\n", |
| 94 | + "- No Normalization: Do not attempt to \"fix\" the spacing of the original HTML to better fit {target_lang_name} grammar.\n", |
| 95 | + "- Preserve every top-level HTML tag, block, and attribute exactly as in the source. Do NOT remove, reorder, normalize, or change any tag name or attribute (including `style`, `align`, `src`, `class`, etc.).\n", |
| 96 | + "- Do NOT modify file paths, URLs, or filenames in attributes (e.g., `src`, `href`) in any way.\n", |
| 97 | + "- Only translate visible human text (text nodes) that appears between tags. Do NOT translate tag names, attributes, filenames, or any code fragments.\n", |
| 98 | + "- Preserve line breaks and indentation for any line that contains HTML tags. If you cannot translate a given line without changing its tags or attributes, leave that line unchanged.\n", |
| 99 | + "- For `<img>` tags and other self-closing tags, do NOT alter the tag; keep it exactly as-is.\n", |
| 100 | + "- For `<a>` tags, preserve the `href` attribute value exactly; translate only the link text.\n", |
| 101 | + "- Do NOT add or remove blank lines; maintain the same number and order of lines as the input.\n", |
| 102 | + "- Output ONLY the final translated Markdown document content. Do NOT include any explanations, notes, or code fences.\n", |
| 103 | + "\n", |
| 104 | + "EXAMPLES (input -> expected output):\n", |
| 105 | + "Input: <div align=\"center\">Welcome to <a href=\"../README.md\">Project</a></div>\n", |
| 106 | + "Output: <div align=\"center\">Bienvenido a <a href=\"../README.md\">Project</a></div>\n", |
| 107 | + "\n", |
| 108 | + "Input: <div style=\"margin:10px;\"><img src=\"../path/logo.png\" alt=\"Logo image\"/></div>\n", |
| 109 | + "Output: <div style=\"margin:10px;\"><img src=\"../path/logo.png\" alt=\"Logo image\"/></div>\n", |
| 110 | + "\n", |
| 111 | + "If you would need to remove, alter, or reformat any HTML tag or attribute to perform the translation, instead KEEP that original input line unchanged.\n", |
| 112 | + "\n", |
| 113 | + "PENALTY INSTRUCTION: If you remove or change any HTML tag or attribute, that is incorrect — in that case the correct output is the exact original file content.\n", |
| 114 | + "\"\"\"" |
| 115 | + ] |
| 116 | + }, |
| 117 | + { |
| 118 | + "cell_type": "code", |
| 119 | + "execution_count": null, |
| 120 | + "id": "bafe7308", |
| 121 | + "metadata": {}, |
| 122 | + "outputs": [], |
| 123 | + "source": [ |
| 124 | + "response = client.chat.completions.create(  # one chat-completion request covering the whole README\n", |
| 125 | + " model=\"aya-expanse-8b\",  # presumably the model identifier loaded in LM Studio - confirm\n", |
| 126 | + " messages=[\n", |
| 127 | + " {\"role\": \"system\", \"content\": prompt},  # translation rules\n", |
| 128 | + " {\"role\": \"user\", \"content\": text_to_translate}  # document to translate\n", |
| 129 | + " ],\n", |
| 130 | + " temperature=0,\n", |
| 131 | + ")\n", |
| 132 | + "translated_content = response.choices[0].message.content.strip()  # NOTE(review): .strip() raises AttributeError if content is None - confirm whether a guard is needed" |
| 133 | + ] |
| 134 | + }, |
| 135 | + { |
| 136 | + "cell_type": "code", |
| 137 | + "execution_count": null, |
| 138 | + "id": "19af8f07", |
| 139 | + "metadata": {}, |
| 140 | + "outputs": [], |
| 141 | + "source": [ |
| 142 | + "# --- POST-PROCESSING: Fuzzy Restoration ---\n", |
| 143 | + "\n", |
| 144 | + "translated_content = re.sub(r'(\\[.*?\\]\\()(?!(?:http|/|#|\\.\\./))', r'\\1../', translated_content)\n", |
| 145 | + "translated_content = re.sub(r'((?:src|href)=[\"\\'])(?!(?:http|/|#|\\.\\./))', r'\\1../', translated_content)\n", |
| 146 | + "\n", |
| 147 | + "with open(OUTPUT_PATH, \"w\", encoding=\"utf-8\") as f:\n", |
| 148 | + " f.write(translated_content)\n", |
| 149 | + "\n", |
| 150 | + "print(f\"Done! Translated README saved to: {OUTPUT_PATH}\")" |
| 151 | + ] |
10 | 152 | } |
11 | 153 | ], |
12 | 154 | "metadata": { |
|
16 | 158 | "name": "python3" |
17 | 159 | }, |
18 | 160 | "language_info": { |
| 161 | + "codemirror_mode": { |
| 162 | + "name": "ipython", |
| 163 | + "version": 3 |
| 164 | + }, |
| 165 | + "file_extension": ".py", |
| 166 | + "mimetype": "text/x-python", |
19 | 167 | "name": "python", |
| 168 | + "nbconvert_exporter": "python", |
| 169 | + "pygments_lexer": "ipython3", |
20 | 170 | "version": "3.12.12" |
21 | 171 | } |
22 | 172 | }, |
|
0 commit comments