Skip to content

Commit 8a42f02

Browse files
committed
feat: add scripts to translate into vietnamese
1 parent c7998ab commit 8a42f02

File tree

2 files changed

+210
-0
lines changed

2 files changed

+210
-0
lines changed

scripts/translation.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import os
2+
import sys
3+
from huggingface_hub import InferenceClient
4+
from dotenv import load_dotenv
5+
# Load variables from a local .env file into the process environment
# before anything below reads them.
load_dotenv()


# Fail fast at import time when no Hugging Face token is configured.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN not found in environment variables. Please set it in a .env file.")


# Defaults are anchored to this script's own location, so the script does
# not depend on the current working directory.
script_dir = os.path.dirname(os.path.abspath(__file__))
default_inp_dir = os.path.join(script_dir, '..', 'units/en')
default_model = "deepseek-ai/DeepSeek-R1"
# No api_key is passed explicitly; per the original note, the client is
# expected to read it from the environment.
default_client = InferenceClient(provider="together")
21+
22+
def _escape_special_tokens(text: str) -> str:
    """Mask literal <think> tags in source text so the reasoning filter cannot cut on them."""
    return text.replace('<think>', '<%%think%%>').replace('</think>', '<%%/think%%>')


def _unescape_special_tokens(text: str) -> str:
    """Restore the <think> tags masked by ``_escape_special_tokens``."""
    return text.replace('<%%think%%>', '<think>').replace('<%%/think%%>', '</think>')


def _output_path_for(inp_file: str, inp_dir: str, output_lang: str) -> str:
    """Return where the translation of *inp_file* (located under *inp_dir*) is written.

    The last path component of *inp_dir* that is exactly ``en`` is swapped
    for *output_lang*; the file's path relative to *inp_dir* is kept as-is.
    Working on whole components (instead of replacing the ``/en`` substring)
    keeps names such as ``env.mdx`` or ``/home/enrico`` intact and is
    independent of the platform's path separator.

    If *inp_dir* has no ``en`` component the input path is returned
    unchanged, so the caller's "output already exists" check skips the file
    and the source is never overwritten.
    """
    rel_path = os.path.relpath(inp_file, inp_dir)
    parts = os.path.normpath(inp_dir).split(os.sep)
    for idx in range(len(parts) - 1, -1, -1):
        if parts[idx] == 'en':
            parts[idx] = output_lang
            return os.path.join(os.sep.join(parts), rel_path)
    return inp_file


def _write_out_file(fpath: str, content: str) -> None:
    """Write *content* to *fpath* as UTF-8, creating parent directories when needed."""
    base_path = os.path.dirname(fpath)
    if base_path:  # os.makedirs('') raises for a bare file name
        os.makedirs(base_path, exist_ok=True)
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write(content)


def _collect_stream_text(stream) -> str:
    """Echo a streamed chat completion to stdout and return the concatenated text.

    Chunks without choices, and deltas whose content is None or empty, are
    skipped instead of being printed or concatenated.
    """
    pieces: list[str] = []
    for chunk in stream:
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if not piece:
            continue
        print(piece, end="", flush=True)
        pieces.append(piece)
    return "".join(pieces)


def auto_translate(
    output_lang: str,
    prompt: callable,
    inp_dir: str = default_inp_dir,
    model: str = default_model,
    client: InferenceClient = default_client
):
    """Translate every ``.mdx`` file and ``_toctree.yml`` found under *inp_dir*.

    Each input file is sent to the chat model as a single user message and
    the answer is written to the mirrored path in which the ``en`` directory
    is replaced by *output_lang*. Files whose output already exists are
    skipped, so an interrupted run can be resumed. Empty inputs produce
    empty outputs without calling the model.

    Args:
        output_lang: Name of the target language directory (e.g. ``"vi"``).
        prompt: Callable that takes the file content and returns the full
            prompt text sent to the model.
        inp_dir: Root directory that is walked recursively for inputs.
        model: Model identifier passed to the chat completion call.
        client: Client whose ``chat.completions.create`` is used with
            ``stream=True``.
    """
    # Get the list of all files in the directory, recursively
    inp_files: list[str] = []
    print('Collecting files...')
    for root, _dirs, files in os.walk(inp_dir):
        for file in files:
            if file.endswith('.mdx') or file == "_toctree.yml":
                fname = os.path.join(root, file)
                print(' +', fname)
                inp_files.append(fname)

    total = len(inp_files)
    for i, inp_file in enumerate(inp_files, start=1):
        out_file = _output_path_for(inp_file, inp_dir, output_lang)
        if os.path.exists(out_file):
            print(f'[{i}/{total}] Skipping file: {inp_file}')
            continue

        with open(inp_file, 'r', encoding='utf-8') as f:
            content: str = f.read()
        # Mask <think> tags that belong to the document itself; otherwise
        # an echoed tag would be mistaken for the model's reasoning marker.
        content = _escape_special_tokens(content)
        if not content.strip():
            print(f'[{i}/{total}] Skipping empty file: {inp_file}')
            _write_out_file(out_file, "")
            continue

        print(f'[{i}/{total}] Processing file: {inp_file}')
        stream = client.chat.completions.create(
            model=model,
            temperature=0.0,
            messages=[
                {"role": "user", "content": prompt(content)},
            ],
            stream=True,
        )
        final_text = _collect_stream_text(stream)
        # Drop the <think>...</think> reasoning: keep only what follows the
        # last closing tag (the whole text when there is none).
        final_text = final_text.rpartition('</think>')[2].strip()
        # Write the output to the file
        final_text = _unescape_special_tokens(final_text)
        _write_out_file(out_file, final_text)
        print()
        print(f' -> Translated to: {out_file}')
        print("--" * 20)

scripts/vi.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
from translation import auto_translate
2+
3+
output_lang = "vi"


def prompt(content: str) -> str:
    """Build the Vietnamese translation prompt around *content*.

    The template is an f-string, so the literal braces of its embedded
    code samples are doubled; *content* itself is interpolated verbatim
    and needs no escaping. Surrounding whitespace of the final prompt is
    stripped.
    """
    return f'''
You are a translator for the Vietnamese translation team. You are tasked with translating the following texts into Vietnamese. You must follow these instructions:
- Translate the texts into Vietnamese, while keeping the original formatting (either Markdown, MDX or HTML)
- Inside code blocks, translate the comments but leave the code as-is; If the code block contains quite plain texts, you MUST provide the translation in <details> tag
- Do not translate inline code, the URLs and file paths
- If the term is abbreviated, keep the original term and provide the translation in parentheses for the first time it appears in the text
- If there are any slang or funny joke in english, keep it (do not translate) and give an explanation so Vietnamese reader can understand
- Use "ta", "mình", "chúng ta", "chúng mình", "các bạn" as pronouns

KEEP THESE TERMS (DO NOT TRANSLATE, do NOT add translation in parentheses): MCP, API, SDK, CLI, HTML, GGUF, AI, Client, Server, Hugging Face, Space, CodeAgent, LangGraph, LangChain, Llama, Gemma, inference, notebook, python, transformers, token, pretrain, format, certificate.

For these terms, use the pre-defined translation:
- Quick Quiz: Kiểm tra nhanh
- Unit: Chương
- Bonus Unit: Chương bổ trợ
- Module: Mô-đun
- Lesson ...: Bài ...
- Model: Mô hình
- Dataset: Tập dữ liệu
- Course: Khóa học
- state-of-the-art: nổi tiếng
- Q&A: Hỏi và Đáp
- Dummy: ảo (or "giả", or "thử" depending on the context)
- onboarding: làm quen
- Hands-on: Thực hành
- Challenge: Bài tập lớn
- Training: Huấn luyện
- Model Context Protocol: Giao Thức Ngữ Cảnh Mô Hình

Here is an example:
- Original text: [Agents Course](https://huggingface.co/learn/agents-course/) will guide you through building AI agents with LLMs.
- Translation: [Agents Course](https://huggingface.co/learn/agents-course/) sẽ hướng dẫn các bạn cách xây dựng AI Agents với LLMs.

Here is another example:
- Original text: JSON-RPC defines the message format, but MCP also specifies how these messages are transported between Clients and Servers.
- Translation: JSON-RPC định nghĩa format tin nhắn, nhưng MCP cũng chỉ định cách thức các tin nhắn này được truyền tải giữa Client và Server.

If the code block contains many plain texts, provide translation in collapsible <details> tag. Example:
- Original text:
```python
def get_weather(location: str) -> dict:
    """Get the current weather for a specified location."""
    # Connect to weather API and fetch data
    return {{
        "temperature": 72,
        "conditions": "Sunny",
        "humidity": 45
    }}
```
- Translation (add the <details> collapsible ABOVE the original code block):
<details>
<summary>Bấm để xem bản dịch tiếng Việt</summary>
```
def get_weather(location: str) -> dict:
    """Nhận thông tin thời tiết hiện tại ở một địa điểm cụ thể."""
    # Kết nối tới API thời tiết và lấy dữ liệu
    return {{
        "temperature": 72,
        "conditions": "Sunny",
        "humidity": 45
    }}
```
</details>
```
def get_weather(location: str) -> dict:
    """Get the current weather for a specified location."""
    # Connect to weather API and fetch data
    return {{
        "temperature": 72,
        "conditions": "Sunny",
        "humidity": 45
    }}
```

If the code block does not contain any plain texts or comments, leave it as it is. Example:
- Original text:
```json
{{
  "servers": [
    {{
      "name": "File Explorer",
      "transport": {{
        "type": "stdio",
        "command": "python",
        "args": ["/path/to/file_explorer_server.py"]
      }}
    }}
  ]
}}
```

- Translation:
```json
{{
  "servers": [
    {{
      "name": "File Explorer",
      "transport": {{
        "type": "stdio",
        "command": "python",
        "args": ["/path/to/file_explorer_server.py"]
      }}
    }}
  ]
}}
```

IMPORTANT: Only output the translated texts and nothing else, no need explanation or instruction. The input text is between "=== BEGIN OF TEXT ===" and "=== END OF TEXT ===".

Please translate the following texts to Vietnamese:

=== BEGIN OF TEXT ===
{content}
=== END OF TEXT ===
'''.strip()
121+
122+
# Run the translation with the Vietnamese prompt; the remaining arguments
# (input directory, model, client) are left at auto_translate's defaults.
auto_translate(output_lang=output_lang, prompt=prompt)

0 commit comments

Comments
 (0)