@@ -67,9 +67,17 @@ def detect_lang(text):
6767 try :
6868 if not text or not isinstance (text , str ):
6969 return "en"
70+ cleaned_text = text
71+ # remove role and timestamp
72+ cleaned_text = re .sub (r'\b(user|assistant|query|answer)\s*:' , '' , cleaned_text , flags = re .IGNORECASE )
73+ cleaned_text = re .sub (r'\[[\d\-:\s]+\]' , '' , cleaned_text )
74+
75+ # extract chinese characters
7076 chinese_pattern = r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f\U0002b820-\U0002ceaf\uf900-\ufaff]"
71- chinese_chars = re .findall (chinese_pattern , text )
72- if len (chinese_chars ) / len (re .sub (r"[\s\d\W]" , "" , text )) > 0.3 :
77+ chinese_chars = re .findall (chinese_pattern , cleaned_text )
78+ text_without_special = re .sub (r"[\s\d\W]" , "" , cleaned_text )
79+ print (text_without_special )
80+ if text_without_special and len (chinese_chars ) / len (text_without_special ) > 0.3 :
7381 return "zh"
7482 return "en"
7583 except Exception :
@@ -466,15 +474,11 @@ def get_scene_data_info(self, scene_data: list, type: str) -> list[str]:
466474 if type == "chat" :
467475 for items in scene_data :
468476 result = []
469- for item in items :
470- # Convert dictionary to string
471- if "chat_time" in item :
472- result .append (item )
473- else :
474- result .append (item )
477+ for i , item in enumerate (items ):
478+ result .append (item )
475479 if len (result ) >= 10 :
476480 results .append (result )
477- context = copy .deepcopy (result [- 2 :])
481+ context = copy .deepcopy (result [- 2 :]) if i + 1 < len ( items ) else []
478482 result = context
479483 if result :
480484 results .append (result )
0 commit comments