Skip to content

Commit e24a200

Browse files
committed
feat: refine regex patterns in text_split_handle for improved comment detection
--bug=1057526 --user=刘瑞斌 【知识库】markdown文件导入知识库,分段详情中代码块展示异常 https://www.tapd.cn/62980211/s/1719131
1 parent 56fe631 commit e24a200

File tree

1 file changed

+10
-11
lines changed

1 file changed

+10
-11
lines changed

apps/common/handle/impl/text/text_split_handle.py

Lines changed: 10 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -15,12 +15,14 @@
15 15
from common.handle.base_split_handle import BaseSplitHandle
16 16
from common.utils.split_model import SplitModel
17 17

18-
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
19-
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
20-
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
21-
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
22-
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
23-
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
18+
default_pattern_list = [
19+
re.compile('(?<=^)# (?!-\\*- coding:).*|(?<=\\n)# (?!-\\*- coding:).*'),
20+
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
21+
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
22+
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
23+
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
24+
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")
25+
]
24 26

25 27

26 28
class TextSplitHandle(BaseSplitHandle):
@@ -45,11 +47,8 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
45 47
try:
46 48
content = buffer.decode(detect(buffer)['encoding'])
47 49
except BaseException as e:
48-
return {'name': file.name,
49-
'content': []}
50-
return {'name': file.name,
51-
'content': split_model.parse(content)
52-
}
50+
return {'name': file.name, 'content': []}
51+
return {'name': file.name, 'content': split_model.parse(content)}
53 52

54 53
def get_content(self, file, save_image):
55 54
buffer = file.read()

0 commit comments

Comments
 (0)