Skip to content

Commit 4b81874

Browse files
committed
feat: Knowledge base import supports zip, xls, xlsx, and csv formats, while knowledge base export supports zip format
1 parent 982a419 commit 4b81874

File tree

18 files changed

+805
-36
lines changed

18 files changed

+805
-36
lines changed
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# coding=utf-8
2+
"""
3+
@project: maxkb
4+
@Author:虎
5+
@file: csv_parse_qa_handle.py
6+
@date:2024/5/21 14:59
7+
@desc:
8+
"""
9+
import csv
10+
import io
11+
from typing import List
12+
13+
from charset_normalizer import detect
14+
15+
from common.handle.base_split_handle import BaseSplitHandle
16+
17+
18+
def post_cell(cell_value):
    """Escape a cell's text so it fits inside one Markdown table cell.

    Line feeds are turned into ``<br>`` and pipe characters into the
    ``&#124;`` entity, so the value can neither end the table row nor start
    a new column.

    @param cell_value: cell text (must provide ``str.replace``)
    @return: the escaped text
    """
    without_line_feeds = cell_value.replace('\n', '<br>')
    return without_line_feeds.replace('|', '&#124;')
20+
21+
22+
def row_to_md(row):
    """Render one row of cells as a single Markdown table line.

    ``None`` cells become empty strings; every other cell is escaped with
    ``post_cell``. The line starts with ``| ``, separates cells with `` | ``
    and ends with `` |`` plus a newline.

    @param row: iterable of cell values
    @return: the Markdown table line, newline-terminated
    """
    escaped_cells = ['' if cell is None else post_cell(cell) for cell in row]
    return f"| {' | '.join(escaped_cells)} |\n"
25+
26+
27+
class CsvSplitHandle(BaseSplitHandle):
    """Split handler that turns a CSV file into Markdown-table paragraphs."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Convert a CSV file into paragraphs, each holding a Markdown table.

        The first CSV row is used as the table header and is repeated at the
        top of every paragraph. Data rows are appended to the current
        paragraph until adding the next row would reach ``limit`` characters;
        that row then opens the next paragraph.

        @param file: uploaded file object; only ``file.name`` is read here
        @param pattern_list: unused by this handler
        @param with_filter: unused by this handler
        @param limit: soft upper bound on the character length of a paragraph
        @param get_buffer: callable returning the raw bytes of ``file``
        @param save_image: unused by this handler
        @return: ``{'name': file.name, 'content': [{'content': str, 'title': ''}, ...]}``;
                 ``content`` is empty when the file has no header row and may be
                 partial when parsing fails midway
        """
        buffer = get_buffer(file)
        paragraphs = []
        result = {'name': file.name, 'content': paragraphs}
        try:
            # The detector may not be able to guess an encoding; fall back to
            # UTF-8 instead of silently depending on the process locale.
            encoding = detect(buffer)['encoding'] or 'utf-8'
            reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=encoding))
            title_row_list = next(reader, None)
            if not title_row_list:
                # Empty file or blank first line: nothing to build a table from.
                return result
            separator_cells = ['---' if cell is not None else '' for cell in title_row_list]
            title_md_content = row_to_md(title_row_list) + '| ' + ' | '.join(separator_cells) + ' |\n'
            result_item_content = ''
            for row in reader:
                next_md_content = row_to_md(row)
                if not result_item_content:
                    # The first data row always joins the header, whatever its size.
                    result_item_content = title_md_content + next_md_content
                elif len(result_item_content) + len(next_md_content) < limit:
                    result_item_content += next_md_content
                else:
                    paragraphs.append({'content': result_item_content, 'title': ''})
                    # Open the next paragraph with the header AND the row that
                    # did not fit, so that row is not lost.
                    result_item_content = title_md_content + next_md_content
            if result_item_content:
                paragraphs.append({'content': result_item_content, 'title': ''})
            return result
        except Exception:
            # Deliberate best effort: an undecodable or malformed file yields
            # whatever paragraphs were completed before the failure.
            return result

    def get_content(self, file, save_image):
        """Not implemented for CSV files; always returns ``None``."""
        pass

    def support(self, file, get_buffer):
        """Return ``True`` when the file name ends with ``.csv`` (case-insensitive)."""
        return file.name.lower().endswith(".csv")
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# coding=utf-8
2+
"""
3+
@project: maxkb
4+
@Author:虎
5+
@file: text_split_handle.py
6+
@date:2024/3/27 18:19
7+
@desc:
8+
"""
9+
import io
10+
import os
11+
import re
12+
import uuid
13+
import zipfile
14+
from typing import List
15+
from urllib.parse import urljoin
16+
17+
from django.db.models import QuerySet
18+
19+
from common.handle.base_parse_qa_handle import BaseParseQAHandle
20+
from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
21+
from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
22+
from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
23+
from common.util.common import parse_md_image
24+
from dataset.models import Image
25+
26+
27+
class FileBufferHandle:
    """Reads a file once and hands out the same bytes on every later request."""

    # Cached file content; stays None until get_buffer() has read a file.
    buffer = None

    def get_buffer(self, file):
        """Return the cached content, reading ``file`` only if nothing is cached yet.

        @param file: object exposing ``read()``; ignored once a buffer is cached
        @return: the content returned by the first ``read()``
        """
        cached = self.buffer
        if cached is None:
            cached = self.buffer = file.read()
        return cached
34+
35+
36+
# QA parse handlers tried in list order by file_to_paragraph(); the first one
# whose support() accepts the file is used.
split_handles = [XlsParseQAHandle(), XlsxParseQAHandle(), CsvParseQAHandle()]
37+
38+
39+
def save_inner_image(image_list):
    """
    Image-insert hook passed to the nested handlers.

    @param image_list: Image instances to bulk-insert; ``None`` or an empty
                       list makes this a no-op
    @return: None
    """
    if image_list is None or len(image_list) == 0:
        return
    QuerySet(Image).bulk_create(image_list)
47+
48+
49+
def file_to_paragraph(file):
    """
    Parse a file with the first registered handler that supports it.

    @param file: file object to parse
    @return: whatever the chosen handler's handle() returns (the original note
             describes it as the file name plus its paragraph list)
    @raise Exception: when no handler in split_handles supports the file
    """
    # One buffer cache per call, so the file is read at most once however many
    # handlers inspect it.
    buffer_reader = FileBufferHandle().get_buffer
    chosen = next((handler for handler in split_handles if handler.support(file, buffer_reader)), None)
    if chosen is None:
        raise Exception("不支持的文件格式")
    return chosen.handle(file, buffer_reader, save_inner_image)
63+
64+
65+
def is_valid_uuid(uuid_str: str):
    """
    Check whether a string can be parsed as a UUID.

    @param uuid_str: the string to check
    @return: bool - True when uuid.UUID accepts the string, False when it
             raises ValueError
    """
    is_valid = True
    try:
        uuid.UUID(uuid_str)
    except ValueError:
        is_valid = False
    return is_valid
76+
77+
78+
def get_image_list(result_list: list, zip_files: List[str]):
    """
    Collect the images referenced by parsed paragraphs that exist in the archive.

    Each image reference found in a paragraph is resolved relative to the name
    of the result it belongs to. References that do not resolve to an entry of
    ``zip_files`` are ignored. A reference resolving to ``api/file/<uuid>`` or
    ``api/image/<uuid>`` keeps that uuid as its image id and the paragraph is
    left untouched. Any other reference gets a fresh uuid, and the paragraph's
    'content' is rewritten in place to point at ``/api/image/<new uuid>``.

    @param result_list: parse results; each is read via its 'name' and
                        'paragraphs' keys
    @param zip_files: entry names of the archive
    @return: list of {'source_file': archive entry name, 'image_id': id to
             store the image under}
    """
    # Raw string: "\(" in a plain literal is an invalid escape sequence.
    path_pattern = re.compile(r"\(.*\)")
    # A set makes the per-image existence check O(1).
    zip_file_set = set(zip_files)
    image_file_list = []
    for result in result_list:
        for p in result.get('paragraphs', []):
            content: str = p.get('content', '')
            for image in parse_md_image(content):
                search = path_pattern.search(image)
                if not search:
                    continue
                source_image_path = search.group().replace('(', '').replace(')', '')
                # A leading '/' would make urljoin discard the result's
                # directory, so it is turned into a './' relative path first.
                relative_path = '.' + source_image_path if source_image_path.startswith('/') else source_image_path
                image_path = urljoin(result.get('name'), relative_path)
                if image_path not in zip_file_set:
                    continue
                if image_path.startswith(('api/file/', 'api/image/')):
                    image_id = image_path.replace('api/file/', '').replace('api/image/', '')
                    if is_valid_uuid(image_id):
                        # Already addressed by a usable id: keep it and leave
                        # the paragraph text as it is.
                        image_file_list.append({'source_file': image_path,
                                                'image_id': image_id})
                        continue
                new_image_id = str(uuid.uuid1())
                image_file_list.append({'source_file': image_path,
                                        'image_id': new_image_id})
                content = content.replace(source_image_path, f'/api/image/{new_image_id}')
                p['content'] = content

    return image_file_list
116+
117+
118+
def filter_image_file(result_list: list, image_list):
    """
    Drop parse results whose name is the source file of a referenced image.

    @param result_list: parse results; each is read via its 'name' key (a
                        missing name counts as '')
    @param image_list: image descriptors; each is read via its 'source_file' key
    @return: new list with the remaining results in their original order
    """
    # Build the lookup once; membership tests are then O(1) per result.
    image_source_files = {image.get('source_file') for image in image_list}
    return [r for r in result_list if r.get('name', '') not in image_source_files]
121+
122+
123+
class ZipParseQAHandle(BaseParseQAHandle):
    """QA parse handler for zip archives: parses each member and stores the images they reference."""

    def handle(self, file, get_buffer, save_image):
        """
        Parse every member of a zip archive and persist the referenced images.

        @param file: uploaded zip file object
        @param get_buffer: callable returning the raw bytes of ``file``
        @param save_image: callable receiving the list of Image instances built
                           from the archive
        @return: list of parse results from the members that could be parsed,
                 minus results whose name is the source file of a referenced image
        """
        buffer = get_buffer(file)
        result = []
        # Open the archive from memory.
        with zipfile.ZipFile(io.BytesIO(buffer), 'r') as zip_ref:
            member_names = zip_ref.namelist()
            for member_name in member_names:
                if member_name.endswith('/'):
                    # Directory entry, nothing to parse.
                    continue
                with zip_ref.open(member_name) as member:
                    try:
                        value = file_to_paragraph(member)
                    except Exception:
                        # Best effort: members that are unsupported or fail to
                        # parse are skipped.
                        continue
                    if isinstance(value, list):
                        result.extend(value)
                    else:
                        result.append(value)
            image_list = get_image_list(result, member_names)
            result = filter_image_file(result, image_list)
            image_mode_list = []
            for image in image_list:
                source_file = image.get('source_file')
                with zip_ref.open(source_file) as f:
                    image_mode_list.append(Image(id=image.get('image_id'), image=f.read(),
                                                 image_name=os.path.basename(source_file)))
            save_image(image_mode_list)
        return result

    def support(self, file, get_buffer):
        """Return ``True`` when the file name ends with ``.zip`` (case-insensitive)."""
        # The name is lower-cased first, so one suffix check covers ".ZIP" too.
        return file.name.lower().endswith(".zip")
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# coding=utf-8
2+
"""
3+
@project: maxkb
4+
@Author:虎
5+
@file: xls_parse_qa_handle.py
6+
@date:2024/5/21 14:59
7+
@desc:
8+
"""
9+
from typing import List
10+
11+
import xlrd
12+
13+
from common.handle.base_split_handle import BaseSplitHandle
14+
15+
16+
def post_cell(cell_value):
    """Escape a cell's text for use inside a Markdown table cell.

    Applies, in order, line feed -> ``<br>`` and pipe -> ``&#124;``.

    @param cell_value: cell text (must provide ``str.replace``)
    @return: the escaped text
    """
    for raw, escaped in (('\n', '<br>'), ('|', '&#124;')):
        cell_value = cell_value.replace(raw, escaped)
    return cell_value
18+
19+
20+
def row_to_md(row):
    """Render one spreadsheet row as a single Markdown table line.

    ``None`` cells become empty strings; every other cell is converted with
    ``str`` and escaped with ``post_cell``.

    @param row: iterable of cell values
    @return: the Markdown table line, newline-terminated
    """
    rendered_cells = []
    for cell in row:
        rendered_cells.append('' if cell is None else post_cell(str(cell)))
    return '| ' + ' | '.join(rendered_cells) + ' |\n'
23+
24+
25+
def handle_sheet(file_name, sheet, limit: int):
    """Convert one worksheet into paragraphs, each holding a Markdown table.

    The first row is used as the table header and is repeated at the top of
    every paragraph. Data rows are appended to the current paragraph until
    adding the next row would reach ``limit`` characters; that row then opens
    the next paragraph.

    @param file_name: value stored under the result's 'name' key
    @param sheet: worksheet object exposing ``nrows`` and ``row_values(i)``
    @param limit: soft upper bound on the character length of a paragraph
    @return: ``{'name': file_name, 'content': [{'content': str, 'title': ''}, ...]}``;
             ``content`` is empty when the sheet has no header row
    """
    paragraphs = []
    result = {'name': file_name, 'content': paragraphs}
    # Rows are read lazily; there is no need to hold the whole sheet in a list.
    rows = (sheet.row_values(i) for i in range(sheet.nrows))
    title_row_list = next(rows, None)
    if not title_row_list:
        # Empty sheet or empty first row: nothing to build a table from.
        return result
    separator_cells = ['---' if cell is not None else '' for cell in title_row_list]
    title_md_content = row_to_md(title_row_list) + '| ' + ' | '.join(separator_cells) + ' |\n'
    result_item_content = ''
    for row in rows:
        next_md_content = row_to_md(row)
        if not result_item_content:
            # The first data row always joins the header, whatever its size.
            result_item_content = title_md_content + next_md_content
        elif len(result_item_content) + len(next_md_content) < limit:
            result_item_content += next_md_content
        else:
            paragraphs.append({'content': result_item_content, 'title': ''})
            # Open the next paragraph with the header AND the row that did not
            # fit, so that row is not lost.
            result_item_content = title_md_content + next_md_content
    if result_item_content:
        paragraphs.append({'content': result_item_content, 'title': ''})
    return result
55+
56+
57+
class XlsSplitHandle(BaseSplitHandle):
    """Split handler that turns each worksheet of an .xls workbook into Markdown-table paragraphs."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Convert every worksheet of the workbook with ``handle_sheet``.

        @param file: uploaded file object; only ``file.name`` is read here
        @param pattern_list: unused by this handler
        @param with_filter: unused by this handler
        @param limit: passed through to ``handle_sheet``
        @param get_buffer: callable returning the raw bytes of ``file``
        @param save_image: unused by this handler
        @return: list with one ``handle_sheet`` result per worksheet, or a
                 single empty result named after the file when anything fails
        """
        buffer = get_buffer(file)
        try:
            sheets = xlrd.open_workbook(file_contents=buffer).sheets()
            single_sheet = len(sheets) == 1
            parsed_sheets = []
            for sheet in sheets:
                # A lone sheet still called 'Sheet1' is reported under the
                # file's name; every other sheet keeps its own name.
                if single_sheet and sheet.name == 'Sheet1':
                    result_name = file.name
                else:
                    result_name = sheet.name
                sheet_result = handle_sheet(result_name, sheet, limit)
                if sheet_result is not None:
                    parsed_sheets.append(sheet_result)
            return parsed_sheets
        except Exception:
            return [{'name': file.name, 'content': []}]

    def get_content(self, file, save_image):
        """Not implemented for .xls files; always returns ``None``."""
        pass

    def support(self, file, get_buffer):
        """Return ``True`` when the name ends with ``.xls`` and xlrd recognises the content's format."""
        file_name: str = file.name.lower()
        buffer = get_buffer(file)
        return bool(file_name.endswith(".xls") and xlrd.inspect_format(content=buffer))

0 commit comments

Comments
 (0)