@@ -214,4 +214,36 @@ def split_and_transcribe(file_path, model, max_segment_length_ms=59000, audio_fo
214214
215215
def _remove_empty_lines(text):
    """Drop blank lines from *text* and return it as plain text.

    Lines that are empty or contain only whitespace are discarded, the
    remaining lines are re-joined with a single newline, and the result is
    passed through ``markdown_to_plain_text``.

    Args:
        text: The raw text to clean.

    Returns:
        Whatever ``markdown_to_plain_text`` returns for the de-blanked text.
    """
    # Split on a bare newline: a separator with trailing whitespace would not
    # match ordinary line breaks and blank lines would slip through.
    non_blank_lines = (line for line in text.split('\n') if line.strip())
    return markdown_to_plain_text('\n'.join(non_blank_lines))
219+
220+
def markdown_to_plain_text(md: str) -> str:
    """Flatten Markdown/HTML-flavoured text into one line of plain text.

    The order of the steps is deliberate:

    1. ``<form_rander>...</form_rander>`` blocks are removed together with
       their content. This has to run before generic HTML tags are stripped,
       otherwise only the tags vanish and the payload leaks into the output.
    2. Fenced code blocks are removed before inline code is handled, because
       the inline-code rule would otherwise eat the fence backticks and leave
       the code itself behind.
    3. Images are removed; links, headings, bold, italics and inline code
       lose their markup but keep their text.
    4. Any remaining HTML tags are removed.
    5. Every run of whitespace (newlines and tabs included) collapses to a
       single space and the result is stripped.

    Args:
        md: The source text. An empty or ``None`` value yields ``""``.

    Returns:
        The plain-text rendering of ``md`` on a single line.
    """
    if not md:
        return ''

    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        # Form-rendering blocks, including everything between the tags.
        (r'<form_rander>[\s\S]*?<\/form_rander>', '', 0),
        # Fenced code blocks: ```code```
        (r'```[\s\S]*?```', '', 0),
        # Images: ![alt](url)
        (r'!\[.*?\]\(.*?\)', '', 0),
        # Links: [text](url) -> text
        (r'\[([^\]]+)\]\([^)]+\)', r'\1', 0),
        # Heading markers (#, ##, ... ######) at the start of a line.
        (r'^#{1,6}\s+', '', re.MULTILINE),
        # Bold: **text** or __text__
        (r'\*\*(.*?)\*\*', r'\1', 0),
        (r'__(.*?)__', r'\1', 0),
        # Italics: *text* or _text_
        (r'\*(.*?)\*', r'\1', 0),
        (r'_(.*?)_', r'\1', 0),
        # Inline code: `code` -> code
        (r'`(.*?)`', r'\1', 0),
        # Any remaining HTML tags.
        (r'<[^>]+>', '', 0),
        # Collapse all whitespace runs; this also covers repeated newlines,
        # so no separate newline-squeezing step is needed.
        (r'\s+', ' ', 0),
    )

    text = md
    for pattern, replacement, flags in substitutions:
        # re.sub returns a new string; the result must be re-assigned.
        text = re.sub(pattern, replacement, text, flags=flags)

    # Trim leading/trailing spaces left over from removed markup.
    return text.strip()
249+
0 commit comments