Skip to content

Commit 130ddf4

Browse files
committed
2 parents f4ac496 + 433e208 commit 130ddf4

File tree

1 file changed

+113
-64
lines changed

1 file changed

+113
-64
lines changed

book_maker/translator/chatgptapi_translator.py

Lines changed: 113 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -230,40 +230,6 @@ def translate_and_split_lines(self, text):
230230
lines = [line.strip() for line in lines if line.strip() != ""]
231231
return lines
232232

233-
def get_best_result_list(
    self,
    plist_len,
    new_str,
    sleep_dur,
    result_list,
    max_retries=15,
):
    """Retry the translation until its paragraph count matches the source.

    While ``result_list`` does not contain ``plist_len`` items and fewer than
    ``max_retries`` retries have been made, this sleeps ``sleep_dur`` seconds,
    calls ``self.translate_and_split_lines(new_str)`` again, and remembers the
    attempt whose length is closest to ``plist_len``.

    Args:
        plist_len: Number of paragraphs the translation should contain.
        new_str: Text handed to ``self.translate_and_split_lines`` on retry.
        sleep_dur: Seconds to sleep before each retry.
        result_list: Result of the first (already performed) attempt.
        max_retries: Upper bound on the number of retries.

    Returns:
        A ``(best_result_list, retry_count)`` tuple. ``best_result_list`` has
        exactly ``plist_len`` items if any attempt achieved that; otherwise
        it is the attempt closest in length (the earliest one on a tie).
    """
    best_result_list = result_list
    retry_count = 0

    while retry_count < max_retries and len(result_list) != plist_len:
        print(
            f"bug: {plist_len} -> {len(result_list)} : Number of paragraphs before and after translation",
        )
        print(f"sleep for {sleep_dur}s and retry {retry_count+1} ...")
        time.sleep(sleep_dur)
        retry_count += 1
        result_list = self.translate_and_split_lines(new_str)

        # Compare by distance to the target so the kept candidate can only
        # improve. A strict "<" keeps the earlier attempt on a tie.
        new_gap = abs(len(result_list) - plist_len)
        best_gap = abs(len(best_result_list) - plist_len)
        if new_gap < best_gap:
            best_result_list = result_list

    return best_result_list, retry_count
266-
267233
def log_retry(self, state, retry_count, elapsed_time, log_path="log/buglog.txt"):
268234
if retry_count == 0:
269235
return
@@ -333,48 +299,131 @@ def join_lines(self, text):
333299
return new_text
334300

335301
def translate_list(self, plist):
    """Translate a batch of paragraphs in one request, keeping their order.

    Each element of ``plist`` is copied, ``<sup>`` descendants are removed
    from the copy, and its stripped text is sent under a ``PARAGRAPH n:``
    header. ``self.prompt_template`` is temporarily replaced by a prompt that
    asks for ``TRANSLATION OF PARAGRAPH n:`` headers in the reply; those
    headers are then used to align translations with their sources.

    Args:
        plist: Sequence of parsed tags supporting ``find_all`` and
            ``get_text`` (presumably BeautifulSoup tags -- confirm with the
            caller).

    Returns:
        A list of exactly ``len(plist)`` strings. Entry ``n - 1`` is the
        translation found for paragraph ``n``, or ``""`` when the reply
        contained no recognizable section for it.
    """
    plist_len = len(plist)
    print(f"plist len = {plist_len}")
    if plist_len == 0:
        # Nothing to translate; avoid sending an empty request.
        return []

    # Number every paragraph explicitly so the reply can be aligned by index.
    numbered_sources = []
    for i, p in enumerate(plist, 1):
        temp_p = copy(p)
        # Work on a copy so removing <sup> tags does not alter the caller's tag.
        for sup in temp_p.find_all("sup"):
            sup.extract()
        numbered_sources.append(f"PARAGRAPH {i}:\n{temp_p.get_text().strip()}\n\n")
    formatted_text = "".join(numbered_sources)

    # "{{language}}" and "{text}" are left as literal placeholders here;
    # presumably self.translate fills them in -- confirm against translate().
    structured_prompt = (
        f"Translate the following {plist_len} paragraphs to {{language}}. "
        f"CRUCIAL INSTRUCTION: Format your response using EXACTLY this structure:\n\n"
        f"TRANSLATION OF PARAGRAPH 1:\n[Your translation of paragraph 1 here]\n\n"
        f"TRANSLATION OF PARAGRAPH 2:\n[Your translation of paragraph 2 here]\n\n"
        f"... and so on for all {plist_len} paragraphs.\n\n"
        f"You MUST provide EXACTLY {plist_len} translated paragraphs. "
        f"Do not merge, split, or rearrange paragraphs. "
        f"Translate each paragraph independently but consistently. "
        f"Keep all numbers and special formatting in your translation. "
        f"Each original paragraph must correspond to exactly one translated paragraph."
    )

    original_prompt_template = self.prompt_template
    self.prompt_template = structured_prompt + " ```{text}```"
    try:
        translated_text = self.translate(formatted_text, False)
    finally:
        # Restore the template even if the request raises, so later calls
        # are not left running with the structured prompt.
        self.prompt_template = original_prompt_template

    # Pass 1: exact headers. The first section seen for a number wins.
    strict_section = re.compile(
        r"TRANSLATION OF PARAGRAPH (\d+):(.*?)(?=TRANSLATION OF PARAGRAPH \d+:|\Z)",
        re.DOTALL,
    )
    found = {}
    for section in strict_section.finditer(translated_text):
        num = int(section.group(1))
        if 1 <= num <= plist_len:
            found.setdefault(num, section.group(2).strip())

    missing = [i for i in range(1, plist_len + 1) if i not in found]
    if missing:
        for i in missing:
            print(f"Warning: Could not find translation for paragraph {i}")

        # Pass 2: tolerate reworded headers, but require the keyword, the
        # complete number and the colon to sit on one line. Otherwise
        # paragraph 1 could match "PARAGRAPH 10:" or a digit in body text.
        loose_header = (
            r"(?:TRANSLATION|PARAGRAPH|PARA)[^\n:]*?(?<!\d){num}(?!\d)[^\n:]*?:"
        )
        loose_section = re.compile(
            loose_header.format(num=r"(\d+)")
            + r"(.*?)(?="
            + loose_header.format(num=r"\d+")
            + r"|\Z)",
            re.DOTALL,
        )
        for section in loose_section.finditer(translated_text):
            num = int(section.group(1))
            if 1 <= num <= plist_len:
                found.setdefault(num, section.group(2).strip())

    # Always return one entry per source paragraph, in source order.
    return [found.get(i, "") for i in range(1, plist_len + 1)]
402+
403+
def extract_paragraphs(self, text, paragraph_count):
    """Split a ``(1) ... (2) ...`` numbered translation into paragraphs.

    Three strategies are tried in order, each only if the previous one did
    not yield ``paragraph_count`` items:

    1. For each ``i``, take the text between ``(i)`` and ``(i + 1)`` (or the
       end of ``text``).
    2. Take every ``(n) ...`` section and order the sections by ``n``.
    3. Fall back to the non-empty lines of ``text`` with a leading ``(n)``
       or ``n.`` marker removed.

    Args:
        text: Translated text to split.
        paragraph_count: Number of paragraphs expected.

    Returns:
        A list of stripped paragraph strings. The length equals
        ``paragraph_count`` only if one of the strategies happened to produce
        that many items; callers must still check it.
    """
    # Strategy 1: look for each expected marker individually.
    result_list = []
    for i in range(1, paragraph_count + 1):
        pattern = rf"\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)"
        match = re.search(pattern, text, re.DOTALL)
        if match:
            result_list.append(match.group(1).strip())

    # Strategy 2: collect whatever numbered sections exist, ordered by number.
    if len(result_list) != paragraph_count:
        matches = re.findall(r"\((\d+)\)\s*(.*?)(?=\s*\(\d+\)|\Z)", text, re.DOTALL)
        if matches:
            matches.sort(key=lambda x: int(x[0]))
            result_list = [content.strip() for _, content in matches]

    # Strategy 3: one paragraph per non-empty line.
    if len(result_list) != paragraph_count:
        lines = (line.strip() for line in text.splitlines())
        # Drop a leading "(n)" or "n." marker; "(n)" is sometimes translated
        # to "n.". The lookahead keeps decimals such as "3.14" intact.
        result_list = [
            re.sub(r"^(?:\(\d+\)|\d+\.(?!\d))\s*", "", line) for line in lines if line
        ]

    return result_list
379428

380429
def set_deployment_id(self, deployment_id):

0 commit comments

Comments
 (0)