Skip to content

Commit 582fb99

Browse files
committed
feat: add Markdown parsing support for QA handling
1 parent ee53932 commit 582fb99

File tree

3 files changed

+123
-5
lines changed

3 files changed

+123
-5
lines changed

apps/application/flow/step_node/document_split_node/impl/base_document_split_node.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from application.flow.i_step_node import NodeResult
99
from application.flow.step_node.document_split_node.i_document_split_node import IDocumentSplitNode
1010
from common.chunk import text_to_chunk
11-
from knowledge.serializers.document import default_split_handle, FileBufferHandle
11+
from knowledge.serializers.document import default_split_handle, FileBufferHandle, md_qa_split_handle
1212

1313

1414
def bytes_to_uploaded_file(file_bytes, file_name="file.txt"):
@@ -65,7 +65,11 @@ def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_r
6565
get_buffer = FileBufferHandle().get_buffer
6666

6767
file_mem = bytes_to_uploaded_file(doc['content'].encode('utf-8'))
68-
result = default_split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
68+
if split_strategy == 'qa':
69+
result = md_qa_split_handle.handle(file_mem, get_buffer, self._save_image)
70+
else:
71+
result = default_split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer,
72+
self._save_image)
6973
# 统一处理结果为列表
7074
results = result if isinstance(result, list) else [result]
7175

@@ -102,7 +106,7 @@ def _process_split_result(
102106
}
103107
item['name'] = file_name
104108
item['source_file_id'] = source_file_id
105-
item['paragraphs'] = item.pop('content', [])
109+
item['paragraphs'] = item.pop('content', item.get('paragraphs', []))
106110

107111
for paragraph in item['paragraphs']:
108112
paragraph['problem_list'] = self._generate_problem_list(
@@ -126,7 +130,11 @@ def _generate_problem_list(
126130
if document_name_relate_problem_type == 'referencing':
127131
document_name_relate_problem = self.get_reference_content(document_name_relate_problem_reference)
128132

129-
problem_list = []
133+
problem_list = [
134+
item for p in paragraph.get('problem_list', []) for item in p.get('content', '').split('<br>')
135+
if item.strip()
136+
]
137+
130138
if split_strategy == 'auto':
131139
if paragraph_title_relate_problem and paragraph.get('title'):
132140
problem_list.append(paragraph.get('title'))
@@ -141,7 +149,7 @@ def _generate_problem_list(
141149
if document_name_relate_problem and document_name:
142150
problem_list.append(document_name)
143151

144-
return problem_list
152+
return list(set(problem_list))
145153

146154
def get_details(self, index: int, **kwargs):
147155
return {
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# coding=utf-8
2+
"""
3+
@project: maxkb
4+
@Author:虎
5+
@file: md_parse_qa_handle.py
6+
@date:2024/5/21 14:59
7+
@desc:
8+
"""
9+
import re
10+
import traceback
11+
12+
from charset_normalizer import detect
13+
14+
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
15+
from common.utils.logger import maxkb_logger
16+
17+
18+
class MarkdownParseQAHandle(BaseParseQAHandle):
    """Extract question/answer paragraphs from pipe tables in a Markdown file.

    Every table found in the file is read as: first row = column titles,
    optional delimiter row (``| --- | --- |``), remaining rows = data.
    The title row is mapped to the logical ``title`` / ``content`` /
    ``problem_list`` columns by ``get_title_row_index_dict`` (mapping rules
    live in ``base_parse_qa_handle``; not visible here).
    """

    # One cell of a delimiter row: hyphens with optional alignment colons.
    _DELIMITER_CELL = re.compile(r':?-+:?')
    # A column separator is a pipe that is not backslash-escaped.
    _CELL_SEPARATOR = re.compile(r'(?<!\\)\|')
    # A table cell is a single line, so several questions in one cell are
    # written with HTML line breaks; a literal newline is accepted as well.
    _PROBLEM_SEPARATOR = re.compile(r'<br\s*/?>|\n', re.IGNORECASE)

    def support(self, file, get_buffer):
        """Return True when the file name ends with ``.md`` or ``.markdown``.

        ``get_buffer`` is accepted for interface compatibility and is unused.
        """
        file_name: str = file.name.lower()
        return file_name.endswith(('.md', '.markdown'))

    @classmethod
    def _split_row(cls, line):
        """Split one table line (starting with ``|``) into stripped cells.

        The trailing pipe is optional, and ``\\|`` inside a cell is kept as a
        literal pipe instead of being treated as a column separator.
        A line consisting of a single ``|`` yields no cells.
        """
        inner = line.strip()[1:]
        if not inner:
            return []
        # Drop the closing pipe only when it is a real (unescaped) one, so the
        # last cell is preserved for rows written without a trailing pipe.
        if inner.endswith('|') and not inner.endswith('\\|'):
            inner = inner[:-1]
        return [cell.strip().replace('\\|', '|') for cell in cls._CELL_SEPARATOR.split(inner)]

    @classmethod
    def _is_delimiter_row(cls, line):
        """Return True when *line* is a header delimiter row such as ``| --- | :-: |``.

        Every cell must be made of hyphens (with optional colons), so a data
        row like ``| - | text |`` is not mistaken for the delimiter.
        """
        stripped = line.strip()
        if not stripped.startswith('|'):
            return False
        cells = cls._split_row(stripped)
        return bool(cells) and all(cls._DELIMITER_CELL.fullmatch(cell) for cell in cells)

    def parse_markdown_table(self, content):
        """Parse the Markdown tables in *content*.

        :param content: full text of the Markdown document
        :return: list of tables; each table is a list of rows and each row a
                 list of cell strings. Row 0 is the header. Tables without at
                 least one data row are omitted.
        """
        tables = []
        lines = content.split('\n')
        total = len(lines)
        i = 0

        while i < total:
            line = lines[i].strip()
            # A table starts at any line beginning with a pipe.
            if not line.startswith('|'):
                i += 1
                continue

            # The first line of the run is the header row.
            table_data = [self._split_row(line)]
            i += 1

            # Skip the delimiter row (e.g. | --- | --- |) when present.
            if i < total and self._is_delimiter_row(lines[i]):
                i += 1

            # Consume data rows until the first line that is not a table row.
            while i < total:
                line = lines[i].strip()
                if not line.startswith('|'):
                    break
                row = self._split_row(line)
                if row:
                    table_data.append(row)
                i += 1

            # Keep only tables with a header and at least one data row.
            if len(table_data) > 1:
                tables.append(table_data)

        return tables

    def handle(self, file, get_buffer, save_image):
        """Convert the tables of a Markdown file into QA paragraphs.

        :param file: uploaded file object; ``file.name`` is used in the result
        :param get_buffer: callable returning the raw bytes of *file*
        :param save_image: accepted for interface compatibility; unused here
        :return: single-element list ``[{'name': ..., 'paragraphs': [...]}]``.
                 Each paragraph has ``title`` (max 255 chars), ``content``
                 (max 102400 chars) and ``problem_list`` (list of
                 ``{'content': str}``, each max 255 chars). On any parsing
                 error the error is logged and an empty paragraph list is
                 returned (best effort).
        """
        buffer = get_buffer(file)
        try:
            # Detect the encoding, falling back to UTF-8 when detection fails.
            encoding = detect(buffer)['encoding']
            text = buffer.decode(encoding if encoding else 'utf-8')

            paragraph_list = []
            for table in self.parse_markdown_table(text):
                if len(table) < 2:
                    continue

                title_row_index_dict = get_title_row_index_dict(table[0])

                for row in table[1:]:
                    content = get_row_value(row, title_row_index_dict, 'content')
                    if content is None:
                        continue

                    problem = get_row_value(row, title_row_index_dict, 'problem_list')
                    problem = str(problem) if problem is not None else ''
                    # Split first, truncate afterwards: the 255-char limit
                    # applies to each question, not to the whole cell.
                    problem_list = [
                        {'content': p.strip()[0:255]}
                        for p in self._PROBLEM_SEPARATOR.split(problem)
                        if p.strip()
                    ]

                    title = get_row_value(row, title_row_index_dict, 'title')
                    title = str(title) if title is not None else ''

                    paragraph_list.append({
                        'title': title[0:255],
                        'content': content[0:102400],
                        'problem_list': problem_list
                    })

            return [{'name': file.name, 'paragraphs': paragraph_list}]

        except Exception as e:
            # Top-level boundary for one file: log and return an empty result
            # so that a single malformed document does not abort the caller.
            maxkb_logger.error(f"Error processing Markdown file {file.name}: {e}, {traceback.format_exc()}")
            return [{'name': file.name, 'paragraphs': []}]

apps/knowledge/serializers/document.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from common.exception.app_exception import AppApiException
3131
from common.field.common import UploadedFileField
3232
from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
33+
from common.handle.impl.qa.md_parse_qa_handle import MarkdownParseQAHandle
3334
from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
3435
from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
3536
from common.handle.impl.qa.zip_parse_qa_handle import ZipParseQAHandle
@@ -75,6 +76,7 @@
7576
default_split_handle
7677
]
7778

79+
md_qa_split_handle = MarkdownParseQAHandle()
7880
parse_qa_handle_list = [XlsParseQAHandle(), CsvParseQAHandle(), XlsxParseQAHandle(), ZipParseQAHandle()]
7981
parse_table_handle_list = [CsvParseTableHandle(), XlsParseTableHandle(), XlsxParseTableHandle()]
8082

0 commit comments

Comments
 (0)