@@ -67,9 +67,18 @@ def detect_lang(text):
6767 try :
6868 if not text or not isinstance (text , str ):
6969 return "en"
70+ cleaned_text = text
71+ # remove role and timestamp
72+ cleaned_text = re .sub (
73+ r"\b(user|assistant|query|answer)\s*:" , "" , cleaned_text , flags = re .IGNORECASE
74+ )
75+ cleaned_text = re .sub (r"\[[\d\-:\s]+\]" , "" , cleaned_text )
76+
77+ # extract chinese characters
7078 chinese_pattern = r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f\U0002b820-\U0002ceaf\uf900-\ufaff]"
71- chinese_chars = re .findall (chinese_pattern , text )
72- if len (chinese_chars ) / len (re .sub (r"[\s\d\W]" , "" , text )) > 0.3 :
79+ chinese_chars = re .findall (chinese_pattern , cleaned_text )
80+ text_without_special = re .sub (r"[\s\d\W]" , "" , cleaned_text )
81+ if text_without_special and len (chinese_chars ) / len (text_without_special ) > 0.3 :
7382 return "zh"
7483 return "en"
7584 except Exception :
@@ -466,15 +475,11 @@ def get_scene_data_info(self, scene_data: list, type: str) -> list[str]:
466475 if type == "chat" :
467476 for items in scene_data :
468477 result = []
469- for item in items :
470- # Convert dictionary to string
471- if "chat_time" in item :
472- result .append (item )
473- else :
474- result .append (item )
478+ for i , item in enumerate (items ):
479+ result .append (item )
475480 if len (result ) >= 10 :
476481 results .append (result )
477- context = copy .deepcopy (result [- 2 :])
482+ context = copy .deepcopy (result [- 2 :]) if i + 1 < len ( items ) else []
478483 result = context
479484 if result :
480485 results .append (result )
0 commit comments