Skip to content

Commit b46fbf6

Browse files
committed
#5 finishing
1 parent bab5511 commit b46fbf6

File tree

1 file changed

+56
-5
lines changed

1 file changed

+56
-5
lines changed

translate_seg.py

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"de": "German", "fr": "French", "es": "Spanish", "ja": "Japanese",
1010
"zh": "Chinese(Simplified)", "zh-tw": "Chinese(Traditional)",
1111
"ru": "Russian", "pt": "Portuguese", "ko": "Korean", "hi": "Hindi",
12-
# Added languages
12+
# Added languages (not fully checked)
1313
"ar": "Arabic", "cs": "Czech", "nl": "Dutch", "en": "English",
1414
"el": "Greek", "he": "Hebrew", "id": "Indonesian", "it": "Italian",
1515
"fa": "Persian", "pl": "Polish", "ro": "Romanian", "tr": "Turkish",
@@ -23,7 +23,6 @@
2323

2424
# 1. Config and Argument Parsing
2525

26-
# NEW: Language-Specific Prompt Injection
2726
lang_guidance = ""
2827
scripts_dir = "scripts"
2928
guidance_file = os.path.join(scripts_dir, f"{args.lang}.txt")
@@ -124,6 +123,48 @@ def fix_relative_paths(text):
124123
"このセクション", "この中で", "このセクションでは", "意味する", "説明する",
125124
# Russian
126125
"Этот раздел", "В этом", "В этом разделе", "означает", "объясняет", "ниже",
126+
127+
# Arabic
128+
"هذا القسم", "في هذا", "في هذا القسم", "يعني", "يشرح",
129+
130+
# Czech
131+
"Tato sekce", "V tomto", "V této sekci", "znamená", "vysvětluje",
132+
133+
# Dutch
134+
"Deze sectie", "In dit", "In deze sectie", "betekent", "verklaart",
135+
136+
# Greek
137+
"Αυτό το τμήμα", "Σε αυτό", "Σε αυτό το τμήμα", "σημαίνει", "εξηγεί",
138+
139+
# Hebrew
140+
"סעיף זה", "בזה", "בסעיף זה", "משמעותו", "מסביר",
141+
142+
# Indonesian
143+
"Bagian ini", "Dalam ini", "Di bagian ini", "berarti", "menjelaskan",
144+
145+
# Italian
146+
"Questa sezione", "In questo", "In questa sezione", "significa", "spiega",
147+
148+
# Persian (Farsi)
149+
"این بخش", "در این", "در این بخش", "معنی می‌دهد", "توضیح می‌دهد",
150+
151+
# Polish
152+
"Ta sekcja", "W tym", "W tej sekcji", "oznacza", "wyjaśnia",
153+
154+
# Romanian
155+
"Această secțiune", "În acest", "În această secțiune", "înseamnă", "explică",
156+
157+
# Turkish
158+
"Bu bölüm", "Bunda", "Bu bölümde", "anlamına gelir", "açıklar",
159+
160+
# Ukrainian
161+
"Цей розділ", "У цьому", "У цьому розділі", "означає", "пояснює",
162+
163+
# Vietnamese
164+
"Phần này", "Trong này", "Trong phần này", "có nghĩa là", "giải thích",
165+
166+
# Traditional Chinese
167+
"以下", "說明", "本節", "在這裡", "意味著", "解釋",
127168
# Portuguese
128169
"Esta seção", "Nesta seção", "significa", "explica",
129170
# Korean
@@ -195,7 +236,7 @@ def main():
195236
{"role": "system", "content": current_system_prompt},
196237
{"role": "user", "content": ctext}
197238
],
198-
temperature=0, # Keeps the model from getting 'creative'
239+
temperature=0,
199240
stream=True
200241
)
201242

@@ -210,8 +251,18 @@ def main():
210251
translated += delta
211252

212253
# Dynamic length check: warn and abort when the output exceeds multiplier × the input length (in characters)
213-
# Adjust multiplier for Japanese "Nyan" expansion
214-
multiplier = 5.5 if args.lang in ["ja", "hi", "ru"] else 2.5
254+
high_multiplier_map = {
255+
"ja": 5.5, # Japanese can expand a lot
256+
"hi": 5.5, # Hindi often expands a lot (the limit is measured in characters, not tokens)
257+
"ar": 4.0, # Arabic often expands moderately
258+
"he": 4.0, # Hebrew
259+
"fa": 4.0, # Persian (Farsi)
260+
"ru": 3.5, # Russian
261+
"uk": 3.5, # Ukrainian
262+
"pl": 3.5, # Polish
263+
}
264+
265+
multiplier = high_multiplier_map.get(args.lang, 2.5)
215266

216267
if len(translated) > multiplier * len(ctext):
217268
print(f"\n[WARN] Output too long ({len(translated)} chars) — aborting.")

0 commit comments

Comments
 (0)