Skip to content

Commit fb7abb4

Browse files
authored
Pr@main@fix bugs (#41)
* fix: 修复提示问题
* fix: 上传文档限制
* feat: 问题管理
* fix: 修改分段正则,优化分段逻辑
* feat: 问题管理
* fix: word分段支持表格数据
* fix: 问题批量插入去重
* fix: 修复文档问题
* feat: 文档分页优化
* fix: 优化关联问题
* fix: 嵌入样式
1 parent e9d85b0 commit fb7abb4

File tree

23 files changed, +203 -66 lines changed

23 files changed, +203 -66 lines changed

apps/application/template/embed.js

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ const guideHtml=`
1919
</div>
2020
`
2121
const chatButtonHtml=
22-
`<div class="maxkb-chat-button" ><svg width="48" height="56" viewBox="0 0 48 56" fill="none" xmlns="http://www.w3.org/2000/svg">
22+
`<div class="maxkb-chat-button" ><svg style="vertical-align: middle;overflow: hidden;" width="48" height="56" viewBox="0 0 48 56" fill="none" xmlns="http://www.w3.org/2000/svg">
2323
<g filter="url(#filter0_d_349_49711)">
2424
<path d="M8 24C8 12.9543 16.9543 4 28 4H48V44H28C16.9543 44 8 35.0457 8 24Z" fill="url(#paint0_linear_349_49711)"/>
2525
</g>
@@ -164,7 +164,7 @@ function initMaxkbStyle(root){
164164
}
165165
#maxkb .maxkb-mask .maxkb-content {
166166
width: 45px;
167-
height: 50px;
167+
height: 48px;
168168
box-shadow: 1px 1px 1px 2000px rgba(0,0,0,.6);
169169
border-radius: 50% 0 0 50%;
170170
position: absolute;

apps/common/handle/impl/doc_split_handle.py

Lines changed: 32 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -11,14 +11,18 @@
1111
from typing import List
1212

1313
from docx import Document
14+
from docx.table import Table
15+
from docx.text.paragraph import Paragraph
1416

1517
from common.handle.base_split_handle import BaseSplitHandle
1618
from common.util.split_model import SplitModel
1719

18-
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'),
19-
re.compile("(?<!#)### (?!#).*"),
20-
re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
21-
re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")]
20+
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
21+
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
22+
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
23+
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
24+
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
25+
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
2226

2327

2428
class DocSplitHandle(BaseSplitHandle):
@@ -32,9 +36,31 @@ def paragraph_to_md(paragraph):
3236
return paragraph.text
3337
return paragraph.text
3438

39+
@staticmethod
40+
def table_to_md(table):
41+
rows = table.rows
42+
# 创建 Markdown 格式的表格
43+
md_table = '| ' + ' | '.join([cell.text.replace("\n", '</br>') for cell in rows[0].cells]) + ' |\n'
44+
md_table += '| ' + ' | '.join(['---' for i in range(len(rows[0].cells))]) + ' |\n'
45+
for row in rows[1:]:
46+
md_table += '| ' + ' | '.join([cell.text.replace("\n", '</br>') for cell in row.cells]) + ' |\n'
47+
return md_table
48+
3549
def to_md(self, doc):
36-
ps = doc.paragraphs
37-
return "\n".join([self.paragraph_to_md(para) for para in ps])
50+
elements = []
51+
for element in doc.element.body:
52+
if element.tag.endswith('tbl'):
53+
# 处理表格
54+
table = Table(element, doc)
55+
elements.append(table)
56+
elif element.tag.endswith('p'):
57+
# 处理段落
58+
paragraph = Paragraph(element, doc)
59+
elements.append(paragraph)
60+
61+
return "\n".join(
62+
[self.paragraph_to_md(element) if isinstance(element, Paragraph) else self.table_to_md(element) for element
63+
in elements])
3864

3965
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
4066
try:

apps/common/handle/impl/pdf_split_handle.py

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -14,10 +14,13 @@
1414
from common.handle.base_split_handle import BaseSplitHandle
1515
from common.util.split_model import SplitModel
1616

17-
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'),
18-
re.compile("(?<!#)### (?!#).*"),
19-
re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
20-
re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")]
17+
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
18+
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
19+
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
20+
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
21+
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
22+
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*"),
23+
re.compile("(?<!\n)\n\n+")]
2124

2225

2326
def number_to_text(pdf_document, page_number):

apps/common/handle/impl/text_split_handle.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -14,10 +14,12 @@
1414
from common.handle.base_split_handle import BaseSplitHandle
1515
from common.util.split_model import SplitModel
1616

17-
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'),
18-
re.compile("(?<!#)### (?!#).*"),
19-
re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
20-
re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")]
17+
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
18+
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
19+
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
20+
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
21+
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
22+
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
2123

2224

2325
class TextSplitHandle(BaseSplitHandle):

apps/common/util/split_model.py

Lines changed: 30 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -295,7 +295,7 @@ def parse_to_tree(self, text: str, index=0):
295295
"""
296296
if len(self.content_level_pattern) == index:
297297
return
298-
level_content_list = parse_title_level(text, self.content_level_pattern, index)
298+
level_content_list = parse_title_level(text, self.content_level_pattern, 0)
299299
cursor = 0
300300
for i in range(len(level_content_list)):
301301
block, cursor = get_level_block(text, level_content_list, i, cursor)
@@ -313,10 +313,15 @@ def parse_to_tree(self, text: str, index=0):
313313
if end_index == 0:
314314
return level_content_list
315315
other_content = text[0:end_index]
316-
if len(other_content.strip()) > 0:
317-
level_content_list = [*level_content_list, *list(
318-
map(lambda row: to_tree_obj(row, 'block'),
319-
post_handler_paragraph(other_content, with_filter=self.with_filter, limit=self.limit)))]
316+
children = self.parse_to_tree(text=other_content,
317+
index=index)
318+
if len(children) > 0:
319+
level_content_list = [*level_content_list, *children]
320+
else:
321+
if len(other_content.strip()) > 0:
322+
level_content_list = [*level_content_list, *list(
323+
map(lambda row: to_tree_obj(row, 'block'),
324+
post_handler_paragraph(other_content, with_filter=self.with_filter, limit=self.limit)))]
320325
else:
321326
if len(text.strip()) > 0:
322327
level_content_list = [*level_content_list, *list(
@@ -330,15 +335,16 @@ def parse(self, text: str):
330335
:param text: 文本数据
331336
:return: 解析后数据 {content:段落数据,keywords:[‘段落关键词’],parent_chain:['段落父级链路']}
332337
"""
333-
result_tree = self.parse_to_tree(text.replace('\r', '\n'), 0)
338+
text = text.replace('\r', '\n')
339+
result_tree = self.parse_to_tree(text, 0)
334340
result = result_tree_to_paragraph(result_tree, [], [])
335-
# 过滤段落内容不为空字符串的数据
336-
result = [item for item in result if 'content' in item and len(item.get('content').strip()) > 0]
337-
return [self.post_reset_paragraph(item) for item in result]
341+
return [item for item in [self.post_reset_paragraph(row) for row in result] if
342+
'content' in item and len(item.get('content').strip()) > 0]
338343

339344
def post_reset_paragraph(self, paragraph: Dict):
340345
result = self.filter_title_special_characters(paragraph)
341346
result = self.sub_title(result)
347+
result = self.content_is_null(result)
342348
return result
343349

344350
@staticmethod
@@ -349,6 +355,15 @@ def sub_title(paragraph: Dict):
349355
return {**paragraph, 'title': title[0:255], 'content': title[255:len(title)] + paragraph.get('content')}
350356
return paragraph
351357

358+
@staticmethod
359+
def content_is_null(paragraph: Dict):
360+
if 'title' in paragraph:
361+
title = paragraph.get('title')
362+
content = paragraph.get('content')
363+
if (content is None or len(content.strip()) == 0) and (title is not None and len(title) > 0):
364+
return {'title': '', 'content': title}
365+
return paragraph
366+
352367
@staticmethod
353368
def filter_title_special_characters(paragraph: Dict):
354369
title = paragraph.get('title') if 'title' in paragraph else ''
@@ -361,9 +376,12 @@ def filter_title_special_characters(paragraph: Dict):
361376
title_special_characters_list = ['#', '\n', '\r', '\\s']
362377

363378
default_split_pattern = {
364-
'md': [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'), re.compile("(?<!#)### (?!#).*"),
365-
re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
366-
re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")],
379+
'md': [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
380+
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
381+
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
382+
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
383+
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
384+
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")],
367385
'default': [re.compile("(?<!\n)\n\n+")]
368386
}
369387

apps/dataset/serializers/document_serializers.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -506,10 +506,12 @@ def parse(self):
506506
class SplitPattern(ApiMixin, serializers.Serializer):
507507
@staticmethod
508508
def list():
509-
return [{'key': "#", 'value': '(?<=^)# .*|(?<=\\n)# .*'}, {'key': '##', 'value': '(?<!#)## (?!#).*'},
510-
{'key': '###', 'value': "(?<!#)### (?!#).*"}, {'key': '####', 'value': "(?<!#)#### (?!#).*"},
511-
{'key': '#####', 'value': "(?<!#)##### (?!#).*"},
512-
{'key': '######', 'value': "(?<!#)###### (?!#).*"},
509+
return [{'key': "#", 'value': '(?<=^)# .*|(?<=\\n)# .*'},
510+
{'key': '##', 'value': '(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'},
511+
{'key': '###', 'value': "(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"},
512+
{'key': '####', 'value': "(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"},
513+
{'key': '#####', 'value': "(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"},
514+
{'key': '######', 'value': "(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*"},
513515
{'key': '-', 'value': '(?<! )- .*'},
514516
{'key': '空格', 'value': '(?<!\\s)\\s(?!\\s)'},
515517
{'key': '分号', 'value': '(?<!;);(?!;)'}, {'key': '逗号', 'value': '(?<!,),(?!,)'},

apps/dataset/serializers/problem_serializers.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -60,13 +60,14 @@ def batch(self, with_valid=True):
6060
if with_valid:
6161
self.is_valid(raise_exception=True)
6262
problem_list = self.data.get('problem_list')
63+
problem_list = list(set(problem_list))
6364
dataset_id = self.data.get('dataset_id')
6465
exists_problem_content_list = [problem.content for problem in
6566
QuerySet(Problem).filter(dataset_id=dataset_id,
6667
content__in=problem_list)]
6768
problem_instance_list = [Problem(id=uuid.uuid1(), dataset_id=dataset_id, content=problem_content) for
6869
problem_content in
69-
self.data.get('problem_list') if
70+
problem_list if
7071
(not exists_problem_content_list.__contains__(problem_content) if
7172
len(exists_problem_content_list) > 0 else True)]
7273

@@ -122,7 +123,7 @@ def list_paragraph(self, with_valid=True):
122123
self.is_valid(raise_exception=True)
123124
problem_paragraph_mapping = QuerySet(ProblemParagraphMapping).filter(dataset_id=self.data.get("dataset_id"),
124125
problem_id=self.data.get("problem_id"))
125-
if problem_paragraph_mapping is None or len(problem_paragraph_mapping)==0:
126+
if problem_paragraph_mapping is None or len(problem_paragraph_mapping) == 0:
126127
return []
127128
return native_search(
128129
QuerySet(Paragraph).filter(id__in=[row.paragraph_id for row in problem_paragraph_mapping]),

ui/src/assets/csv-icon.svg

Lines changed: 7 additions & 0 deletions
Loading

ui/src/assets/doc-icon.svg

Lines changed: 5 additions & 0 deletions
Loading

0 commit comments

Comments
 (0)