@@ -230,40 +230,6 @@ def translate_and_split_lines(self, text):
230230 lines = [line .strip () for line in lines if line .strip () != "" ]
231231 return lines
232232
def get_best_result_list(
    self,
    plist_len,
    new_str,
    sleep_dur,
    result_list,
    max_retries=15,
):
    """Re-request a translation until its paragraph count matches.

    Args:
        plist_len: Number of paragraphs that were sent for translation.
        new_str: Source text handed to ``self.translate_and_split_lines``
            on every retry.
        sleep_dur: Seconds to sleep before each retry.
        result_list: Paragraphs produced by the first translation attempt.
        max_retries: Upper bound on the number of retries.

    Returns:
        A ``(best_result_list, retry_count)`` tuple. The best list is the
        first one with exactly ``plist_len`` entries if one shows up,
        otherwise the candidate kept by the selection rule below.
    """
    best = result_list
    latest = result_list
    attempts = 0

    while len(latest) != plist_len and attempts < max_retries:
        print(
            f"bug: {plist_len} -> {len(latest)} : Number of paragraphs before and after translation",
        )
        print(f"sleep for {sleep_dur}s and retry {attempts + 1} ...")
        time.sleep(sleep_dur)
        attempts += 1
        latest = self.translate_and_split_lines(new_str)

        got, kept = len(latest), len(best)
        # Keep the new attempt when it is exact, when it grows towards the
        # target without overshooting, or when it shrinks an overshoot.
        is_exact = got == plist_len
        grows_towards_target = kept < got <= plist_len
        shrinks_overshoot = got < kept and kept > plist_len
        if is_exact or grows_towards_target or shrinks_overshoot:
            best = latest

    return best, attempts
267233 def log_retry (self , state , retry_count , elapsed_time , log_path = "log/buglog.txt" ):
268234 if retry_count == 0 :
269235 return
@@ -333,48 +299,131 @@ def join_lines(self, text):
333299 return new_text
334300
def translate_list(self, plist):
    """Translate a batch of paragraph elements with a single request.

    Each paragraph is copied, stripped of its ``<sup>`` children, labelled
    ``PARAGRAPH n:`` and sent in one call to ``self.translate`` under a
    temporary prompt that asks for ``TRANSLATION OF PARAGRAPH n:`` headers.
    The reply is then split back into one string per input paragraph.

    Args:
        plist: Sequence of paragraph elements exposing ``find_all`` and
            ``get_text`` (presumably BeautifulSoup tags; confirm with the
            caller). The elements themselves are not modified.

    Returns:
        A list of exactly ``len(plist)`` strings, in input order. A
        paragraph whose translation cannot be located is returned as ``""``.

    Raises:
        Whatever ``self.translate`` raises; ``self.prompt_template`` is
        restored before the exception propagates.
    """
    plist_len = len(plist)
    print(f"plist len = {plist_len}")
    if plist_len == 0:
        # Nothing to translate, so do not spend a request on an empty prompt.
        return []

    # Label every paragraph so the reply can be mapped back by number.
    numbered_blocks = []
    for index, p in enumerate(plist, 1):
        temp_p = copy(p)  # work on a copy so the caller's element keeps its <sup> tags
        for sup in temp_p.find_all("sup"):
            sup.extract()
        numbered_blocks.append(f"PARAGRAPH {index}:\n{temp_p.get_text().strip()}\n\n")
    formatted_text = "".join(numbered_blocks)

    # "{{language}}" stays a literal "{language}" placeholder in the template;
    # presumably it is filled in downstream together with "{text}".
    structured_prompt = (
        f"Translate the following {plist_len} paragraphs to {{language}}. "
        "CRUCIAL INSTRUCTION: Format your response using EXACTLY this structure:\n\n"
        "TRANSLATION OF PARAGRAPH 1:\n[Your translation of paragraph 1 here]\n\n"
        "TRANSLATION OF PARAGRAPH 2:\n[Your translation of paragraph 2 here]\n\n"
        f"... and so on for all {plist_len} paragraphs.\n\n"
        f"You MUST provide EXACTLY {plist_len} translated paragraphs. "
        "Do not merge, split, or rearrange paragraphs. "
        "Translate each paragraph independently but consistently. "
        "Keep all numbers and special formatting in your translation. "
        "Each original paragraph must correspond to exactly one translated paragraph."
    )

    original_prompt_template = self.prompt_template
    self.prompt_template = structured_prompt + " ```{text}```"
    try:
        translated_text = self.translate(formatted_text, False)
    finally:
        # Always put the caller's template back, even when the request fails,
        # so later translations do not inherit the batch prompt.
        self.prompt_template = original_prompt_template

    strict_end = r"(?=TRANSLATION OF PARAGRAPH \d+:|\Z)"
    loose_end = r"(?=(?:TRANSLATION|PARAGRAPH|PARA).*?\d+.*?:|\Z)"

    translated_paragraphs = []
    for index in range(1, plist_len + 1):
        strict = re.search(
            rf"TRANSLATION OF PARAGRAPH {index}:(.*?)" + strict_end,
            translated_text,
            re.DOTALL,
        )
        if strict:
            translated_paragraphs.append(strict.group(1).strip())
            continue

        print(f"Warning: Could not find translation for paragraph {index}")
        # Tolerate a slightly different header ("PARAGRAPH 3:", "PARA 03:").
        # The digit guards stop number 1 from matching inside 10, 21, ...
        loose = re.search(
            rf"(?:TRANSLATION|PARAGRAPH|PARA).*?(?<!\d)0*{index}(?!\d).*?:(.*?)"
            + loose_end,
            translated_text,
            re.DOTALL,
        )
        translated_paragraphs.append(loose.group(1).strip() if loose else "")

    # Exactly one entry was appended per paragraph, so the length already
    # equals plist_len and no padding or truncation is needed.
    return translated_paragraphs
402+
def extract_paragraphs(self, text, paragraph_count):
    """Split translated text into paragraphs, preferring "(n)" markers.

    Three strategies are tried in order; the next one runs only when the
    previous one did not yield ``paragraph_count`` pieces:

    1. Look up each marker ``(1)`` .. ``(paragraph_count)`` individually.
    2. Collect every ``(n)`` marker in the text and order the pieces by n.
    3. Return the stripped, non-blank lines of the text.

    The result of the last strategy tried is returned as is, so its length
    may still differ from ``paragraph_count``.
    """
    pieces = []
    for number in range(1, paragraph_count + 1):
        found = re.search(
            rf"\({number}\)\s*(.*?)(?=\s*\({number + 1}\)|\Z)", text, re.DOTALL
        )
        if found is not None:
            pieces.append(found.group(1).strip())
    if len(pieces) == paragraph_count:
        return pieces

    # Markers may be missing or out of order: take whatever is numbered.
    numbered = re.findall(r"\((\d+)\)\s*(.*?)(?=\s*\(\d+\)|\Z)", text, re.DOTALL)
    if numbered:
        ordered = sorted(numbered, key=lambda pair: int(pair[0]))
        pieces = [body.strip() for _, body in ordered]
    if len(pieces) == paragraph_count:
        return pieces

    # Last resort: one paragraph per non-blank line.
    return [line for line in map(str.strip, text.splitlines()) if line != ""]
379428
380429 def set_deployment_id (self , deployment_id ):
0 commit comments