DataBoySu
diff --git a/translate_seg.ipynb b/translate_seg.ipynb
Lines changed: 152 additions & 2 deletions
@@ -2,11 +2,153 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "5028def7",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import os\n",
+    "import re\n",
+    "import argparse\n",
+    "from openai import OpenAI  # pip install openai"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "86763887",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Configuration: two-letter language code -> language name inserted into the translation prompt\n",
+    "LANG_MAP = {\n",
+    "    \"de\": \"German\", \"fr\": \"French\", \"es\": \"Spanish\", \"ja\": \"Japanese\", \n",
+    "    \"zh\": \"Chinese(Simplified)\", \"ru\": \"Russian\", \"pt\": \"Portuguese\", \n",
+    "    \"ko\": \"Korean\", \"hi\": \"Hindi\"\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "1d09235e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "LANG = \"de\"\n",
+    "target_lang_name = LANG_MAP.get(LANG, \"English\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd1dedca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# File Paths\n",
+    "BASE_DIR = os.getcwd()\n",
+    "README_PATH = os.path.join(BASE_DIR, \"README.md\")\n",
+    "OUTPUT_PATH = os.path.join(BASE_DIR, \"locales\", f\"README.{LANG}.md\")\n",
+    "\n",
+    "os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "297cf96f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
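+    "# Connect to LM Studio's OpenAI-compatible local server. The port in base_url must match\n",
+    "# the port configured in your LM Studio server settings (LM Studio's default is 1234).\n",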
+    "client = OpenAI(base_url=\"http://localhost:5432/v1\", api_key=\"lm-studio\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db37de98",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(README_PATH, \"r\", encoding=\"utf-8\") as f:\n",
+    "    original_text = f.read()\n",
+    "\n",
+    "text_to_translate = original_text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "865cb4aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# --- TRANSLATION via LM STUDIO ---\n",
+    "prompt = f\"\"\"You are a precise, literal technical translator. Translate the README into professional {target_lang_name} while obeying the following STRICT rules.\n",
+    "\n",
+    "MANDATORY RULES (do not ignore):\n",
+    "- CJK Token Separation: Do NOT merge or delete punctuation, spaces, or line breaks that sit between an HTML tag and target language text.\n",
+    "- Visual Preservation: Treat every < and > as a hard boundary. Do not allow any Chinese or Japanese characters to \"leak\" inside a tag's brackets.\n",
+    "- No Normalization: Do not attempt to \"fix\" the spacing of the original HTML to better fit {target_lang_name} grammar.\n",
+    "- Preserve every top-level HTML tag, block, and attribute exactly as in the source. Do NOT remove, reorder, normalize, or change any tag name or attribute (including `style`, `align`, `src`, `class`, etc.).\n",
+    "- Do NOT modify file paths, URLs, or filenames in attributes (e.g., `src`, `href`) in any way.\n",
+    "- Only translate visible human text (text nodes) that appears between tags. Do NOT translate tag names, attributes, filenames, or any code fragments.\n",
+    "- Preserve line breaks and indentation for any line that contains HTML tags. If you cannot translate a given line without changing its tags or attributes, leave that line unchanged.\n",
+    "- For `<img>` tags and other self-closing tags, do NOT alter the tag; keep it exactly as-is.\n",
+    "- For `<a>` tags, preserve the `href` attribute value exactly; translate only the link text.\n",
+    "- Do NOT add or remove blank lines; maintain the same number and order of lines as the input.\n",
+    "- Output ONLY the final translated Markdown document content. Do NOT include any explanations, notes, or code fences.\n",
+    "\n",
+    "EXAMPLES (input -> expected output):\n",
+    "Input:  <div align=\"center\">Welcome to <a href=\"../README.md\">Project</a></div>\n",
+    "Output: <div align=\"center\">Bienvenido a <a href=\"../README.md\">Project</a></div>\n",
+    "\n",
+    "Input:  <div style=\"margin:10px;\"><img src=\"../path/logo.png\" alt=\"Logo image\"/></div>\n",
+    "Output: <div style=\"margin:10px;\"><img src=\"../path/logo.png\" alt=\"Logo image\"/></div>\n",
+    "\n",
+    "If you would need to remove, alter, or reformat any HTML tag or attribute to perform the translation, instead KEEP that original input line unchanged.\n",
+    "\n",
+    "PENALTY INSTRUCTION: If you remove or change any HTML tag or attribute, that is incorrect — in that case the correct output is the exact original file content.\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bafe7308",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = client.chat.completions.create(\n",
+    "    model=\"aya-expanse-8b\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"system\", \"content\": prompt},\n",
+    "        {\"role\": \"user\", \"content\": text_to_translate}\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    ")\n",
+    "translated_content = response.choices[0].message.content.strip()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19af8f07",
+   "metadata": {},
+   "outputs": [],
+   "source": [
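+    "# --- POST-PROCESSING: Fuzzy Restoration ---\n",
+    "# The translation is written to locales/, one directory below the source README, so\n",
+    "# relative targets are re-rooted with a leading `../`: first Markdown links of the form\n",
+    "# `[text](target)`, then HTML `src`/`href` attribute values. Targets that already start\n",
+    "# with `http`, `/`, `#` or `../` are left unchanged.\n",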
+    "\n",
+    "translated_content = re.sub(r'(\\[.*?\\]\\()(?!(?:http|/|#|\\.\\./))', r'\\1../', translated_content)\n",
+    "translated_content = re.sub(r'((?:src|href)=[\"\\'])(?!(?:http|/|#|\\.\\./))', r'\\1../', translated_content)\n",
+    "\n",
+    "with open(OUTPUT_PATH, \"w\", encoding=\"utf-8\") as f:\n",
+    "    f.write(translated_content)\n",
+    "\n",
+    "print(f\"Done! Translated README saved to: {OUTPUT_PATH}\")"
+   ]
   }
  ],
  "metadata": {
@@ -16,7 +158,15 @@
    "name": "python3"
   },
   "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
    "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
    "version": "3.12.12"
   }
  },