Skip to content

Commit 2a15062

Browse files
committed
1 parent dc5ff9d commit 2a15062

File tree

2 files changed

+152
-2
lines changed

2 files changed

+152
-2
lines changed

translate_seg.ipynb

Lines changed: 152 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,153 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": null,
5+
"execution_count": 1,
66
"id": "5028def7",
77
"metadata": {},
88
"outputs": [],
9-
"source": []
9+
"source": [
10+
"import os\n",
11+
"import re\n",
12+
"import argparse\n",
13+
"from openai import OpenAI # pip install openai"
14+
]
15+
},
16+
{
17+
"cell_type": "code",
18+
"execution_count": 2,
19+
"id": "86763887",
20+
"metadata": {},
21+
"outputs": [],
22+
"source": [
23+
"# 1. Configuration\n",
24+
"LANG_MAP = {\n",
25+
" \"de\": \"German\", \"fr\": \"French\", \"es\": \"Spanish\", \"ja\": \"Japanese\", \n",
26+
" \"zh\": \"Chinese(Simplified)\", \"ru\": \"Russian\", \"pt\": \"Portuguese\", \n",
27+
" \"ko\": \"Korean\", \"hi\": \"Hindi\"\n",
28+
"}"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": 14,
34+
"id": "1d09235e",
35+
"metadata": {},
36+
"outputs": [],
37+
"source": [
38+
"LANG = \"de\"\n",
39+
"target_lang_name = LANG_MAP.get(LANG, \"English\")"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": null,
45+
"id": "cd1dedca",
46+
"metadata": {},
47+
"outputs": [],
48+
"source": [
49+
"# File Paths\n",
50+
"BASE_DIR = os.getcwd()\n",
51+
"README_PATH = os.path.join(BASE_DIR, \"README.md\")\n",
52+
"OUTPUT_PATH = os.path.join(BASE_DIR, \"locales\", f\"README.{LANG}.md\")\n",
53+
"\n",
54+
"os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": null,
60+
"id": "297cf96f",
61+
"metadata": {},
62+
"outputs": [],
63+
"source": [
64+
"# Connect to LM Studio (Ensure port matches your LM Studio server settings)\n",
65+
"client = OpenAI(base_url=\"http://localhost:5432/v1\", api_key=\"lm-studio\")"
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": null,
71+
"id": "db37de98",
72+
"metadata": {},
73+
"outputs": [],
74+
"source": [
75+
"with open(README_PATH, \"r\", encoding=\"utf-8\") as f:\n",
76+
" original_text = f.read()\n",
77+
"\n",
78+
"text_to_translate = original_text"
79+
]
80+
},
81+
{
82+
"cell_type": "code",
83+
"execution_count": null,
84+
"id": "865cb4aa",
85+
"metadata": {},
86+
"outputs": [],
87+
"source": [
88+
"# --- TRANSLATION via LM STUDIO ---\n",
89+
"prompt = f\"\"\"You are a precise, literal technical translator. Translate the README into professional {target_lang_name} while obeying the following STRICT rules.\n",
90+
"\n",
91+
"MANDATORY RULES (do not ignore):\n",
92+
"- CJK Token Separation: Do NOT merge or delete punctuation, spaces, or line breaks that sit between an HTML tag and target language text.\n",
93+
"- Visual Preservation: Treat every < and > as a hard boundary. Do not allow any Chinese or Japanese characters to \"leak\" inside a tag's brackets.\n",
94+
"- No Normalization: Do not attempt to \"fix\" the spacing of the original HTML to better fit {target_lang_name} grammar.\n",
95+
"- Preserve every top-level HTML tag, block, and attribute exactly as in the source. Do NOT remove, reorder, normalize, or change any tag name or attribute (including `style`, `align`, `src`, `class`, etc.).\n",
96+
"- Do NOT modify file paths, URLs, or filenames in attributes (e.g., `src`, `href`) in any way.\n",
97+
"- Only translate visible human text (text nodes) that appears between tags. Do NOT translate tag names, attributes, filenames, or any code fragments.\n",
98+
"- Preserve line breaks and indentation for any line that contains HTML tags. If you cannot translate a given line without changing its tags or attributes, leave that line unchanged.\n",
99+
"- For `<img>` tags and other self-closing tags, do NOT alter the tag; keep it exactly as-is.\n",
100+
"- For `<a>` tags, preserve the `href` attribute value exactly; translate only the link text.\n",
101+
"- Do NOT add or remove blank lines; maintain the same number and order of lines as the input.\n",
102+
"- Output ONLY the final translated Markdown document content. Do NOT include any explanations, notes, or code fences.\n",
103+
"\n",
104+
"EXAMPLES (input -> expected output; shown here for Spanish, apply the same pattern for {target_lang_name}):\n",
105+
"Input: <div align=\"center\">Welcome to <a href=\"../README.md\">Project</a></div>\n",
106+
"Output: <div align=\"center\">Bienvenido a <a href=\"../README.md\">Project</a></div>\n",
107+
"\n",
108+
"Input: <div style=\"margin:10px;\"><img src=\"../path/logo.png\" alt=\"Logo image\"/></div>\n",
109+
"Output: <div style=\"margin:10px;\"><img src=\"../path/logo.png\" alt=\"Logo image\"/></div>\n",
110+
"\n",
111+
"If you would need to remove, alter, or reformat any HTML tag or attribute to perform the translation, instead KEEP that original input line unchanged.\n",
112+
"\n",
113+
"PENALTY INSTRUCTION: If you remove or change any HTML tag or attribute, that is incorrect — in that case the correct output is the exact original file content.\n",
114+
"\"\"\""
115+
]
116+
},
117+
{
118+
"cell_type": "code",
119+
"execution_count": null,
120+
"id": "bafe7308",
121+
"metadata": {},
122+
"outputs": [],
123+
"source": [
124+
"response = client.chat.completions.create(\n",
125+
" model=\"aya-expanse-8b\",\n",
126+
" messages=[\n",
127+
" {\"role\": \"system\", \"content\": prompt},\n",
128+
" {\"role\": \"user\", \"content\": text_to_translate}\n",
129+
" ],\n",
130+
" temperature=0,\n",
131+
")\n",
132+
"translated_content = response.choices[0].message.content.strip()"
133+
]
134+
},
135+
{
136+
"cell_type": "code",
137+
"execution_count": null,
138+
"id": "19af8f07",
139+
"metadata": {},
140+
"outputs": [],
141+
"source": [
142+
"# --- POST-PROCESSING: prefix relative link/src/href targets with ../ (output is written under locales/) ---\n",
143+
"\n",
144+
"translated_content = re.sub(r'(\\[.*?\\]\\()(?!(?:http|/|#|\\.\\./))', r'\\1../', translated_content)\n",
145+
"translated_content = re.sub(r'((?:src|href)=[\"\\'])(?!(?:http|/|#|\\.\\./))', r'\\1../', translated_content)\n",
146+
"\n",
147+
"with open(OUTPUT_PATH, \"w\", encoding=\"utf-8\") as f:\n",
148+
" f.write(translated_content)\n",
149+
"\n",
150+
"print(f\"Done! Translated README saved to: {OUTPUT_PATH}\")"
151+
]
10152
}
11153
],
12154
"metadata": {
@@ -16,7 +158,15 @@
16158
"name": "python3"
17159
},
18160
"language_info": {
161+
"codemirror_mode": {
162+
"name": "ipython",
163+
"version": 3
164+
},
165+
"file_extension": ".py",
166+
"mimetype": "text/x-python",
19167
"name": "python",
168+
"nbconvert_exporter": "python",
169+
"pygments_lexer": "ipython3",
20170
"version": "3.12.12"
21171
}
22172
},

translate_seg.py

Whitespace-only changes.

0 commit comments

Comments
 (0)