Skip to content

Commit 48297d8

Browse files
committed
feat: add initial implementations of various file handling classes for CSV, XLS, and XLSX formats
1 parent c8ce7e2 commit 48297d8

36 files changed

+2427
-1
lines changed

apps/common/constants/permission_constants.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,14 @@ class PermissionConstants(Enum):
218218
RoleConstants.USER])
219219
KNOWLEDGE_DELETE = Permission(group=Group.KNOWLEDGE, operate=Operate.DELETE, role_list=[RoleConstants.ADMIN,
220220
RoleConstants.USER])
221+
DOCUMENT_READ = Permission(group=Group.KNOWLEDGE, operate=Operate.DELETE, role_list=[RoleConstants.ADMIN,
222+
RoleConstants.USER])
223+
DOCUMENT_CREATE = Permission(group=Group.KNOWLEDGE, operate=Operate.DELETE, role_list=[RoleConstants.ADMIN,
224+
RoleConstants.USER])
225+
DOCUMENT_EDIT = Permission(group=Group.KNOWLEDGE, operate=Operate.DELETE, role_list=[RoleConstants.ADMIN,
226+
RoleConstants.USER])
227+
DOCUMENT_DELETE = Permission(group=Group.KNOWLEDGE, operate=Operate.DELETE, role_list=[RoleConstants.ADMIN,
228+
RoleConstants.USER])
221229

222230
def get_workspace_application_permission(self):
223231
return lambda r, kwargs: Permission(group=self.value.group, operate=self.value.operate,

apps/common/handle/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# coding=utf-8
2+
"""
3+
@project: qabot
4+
@Author:虎
5+
@file: __init__.py
6+
@date:2023/9/6 10:09
7+
@desc:
8+
"""
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# coding=utf-8
2+
"""
3+
@project: maxkb
4+
@Author:虎
5+
@file: base_parse_qa_handle.py
6+
@date:2024/5/21 14:56
7+
@desc:
8+
"""
9+
from abc import ABC, abstractmethod
10+
11+
12+
def get_row_value(row, title_row_index_dict, field):
    """
    Look up the cell of ``row`` that belongs to ``field``.

    :param row: sequence of cell values for one data row
    :param title_row_index_dict: mapping of field name -> column index
    :param field: field name to resolve
    :return: the cell value, or ``None`` when the field has no mapped column
             or the row is too short to contain that column
    """
    position = title_row_index_dict.get(field)
    # Unknown field, or the row ends before the mapped column.
    if position is not None and position < len(row):
        return row[position]
    return None
19+
20+
21+
def get_title_row_index_dict(title_row_list):
    """
    Build the field-name -> column-index mapping for a QA sheet header row.

    Positional defaults are chosen from the number of header cells:

    * 1 column  -> ``content`` only
    * 2 columns -> ``title``, ``content``
    * otherwise -> ``title``, ``content``, ``problem_list``

    Any header cell whose text starts with one of the known Chinese labels
    then overrides the positional default for that field.

    :param title_row_list: header cells of the first row (values may be
                           ``None`` or non-string for empty/numeric cells)
    :return: dict with some of the keys ``title``, ``content``, ``problem_list``
    """
    column_count = len(title_row_list)
    if column_count == 1:
        title_row_index_dict = {'content': 0}
    elif column_count == 2:
        # This branch was previously unreachable (duplicated ``== 1`` test),
        # so two-column files were given a non-existent ``problem_list`` column.
        title_row_index_dict = {'title': 0, 'content': 1}
    else:
        title_row_index_dict = {'title': 0, 'content': 1, 'problem_list': 2}

    for index, title_row in enumerate(title_row_list):
        if title_row is None:
            title_row = ''
        elif not isinstance(title_row, str):
            # Spreadsheet cells may hold numbers/dates; never crash on them.
            title_row = str(title_row)
        if title_row.startswith('分段标题'):
            title_row_index_dict['title'] = index
        if title_row.startswith('分段内容'):
            title_row_index_dict['content'] = index
        if title_row.startswith('问题'):
            title_row_index_dict['problem_list'] = index
    return title_row_index_dict
43+
44+
45+
class BaseParseQAHandle(ABC):
    """
    Abstract interface for question/answer file parsers.

    Concrete subclasses must implement both methods; the base versions do
    nothing and return ``None``.
    """

    @abstractmethod
    def support(self, file, get_buffer):
        """
        Presumably reports whether this handler can process ``file`` --
        the exact contract is defined by the implementations, not here.

        :param file: the uploaded file object
        :param get_buffer: callable used to obtain the file's bytes
        """

    @abstractmethod
    def handle(self, file, get_buffer, save_image):
        """
        Parse ``file``; the result shape is defined by the implementations.

        :param file: the uploaded file object
        :param get_buffer: callable used to obtain the file's bytes
        :param save_image: callback for images found while parsing
        """
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# coding=utf-8
2+
"""
3+
@project: maxkb
4+
@Author:虎
5+
@file: base_parse_qa_handle.py
6+
@date:2024/5/21 14:56
7+
@desc:
8+
"""
9+
from abc import ABC, abstractmethod
10+
11+
12+
class BaseParseTableHandle(ABC):
    """
    Abstract interface for table-file parsers.

    Concrete subclasses must implement all three methods; the base versions
    do nothing and return ``None``.
    """

    @abstractmethod
    def support(self, file, get_buffer):
        """
        Presumably reports whether this handler can process ``file`` --
        the exact contract is defined by the implementations, not here.

        :param file: the uploaded file object
        :param get_buffer: callable used to obtain the file's bytes
        """

    @abstractmethod
    def handle(self, file, get_buffer, save_image):
        """
        Parse ``file``; the result shape is defined by the implementations.

        :param file: the uploaded file object
        :param get_buffer: callable used to obtain the file's bytes
        :param save_image: callback for images found while parsing
        """

    @abstractmethod
    def get_content(self, file, save_image):
        """
        Extract the content of ``file``; the format is implementation-defined.

        :param file: the uploaded file object
        :param save_image: callback for images found while parsing
        """
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# coding=utf-8
2+
"""
3+
@project: maxkb
4+
@Author:虎
5+
@file: base_split_handle.py
6+
@date:2024/3/27 18:13
7+
@desc:
8+
"""
9+
from abc import ABC, abstractmethod
10+
from typing import List
11+
12+
13+
class BaseSplitHandle(ABC):
    """
    Abstract interface for document-splitting handlers.

    Concrete subclasses must implement all three methods; the base versions
    do nothing and return ``None``.
    """

    @abstractmethod
    def support(self, file, get_buffer):
        """
        Presumably reports whether this handler can process ``file`` --
        the exact contract is defined by the implementations, not here.

        :param file: the uploaded file object
        :param get_buffer: callable used to obtain the file's bytes
        """

    @abstractmethod
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """
        Split ``file``; the result shape is defined by the implementations.

        :param file: the uploaded file object
        :param pattern_list: list of split patterns
        :param with_filter: flag forwarded to the splitter
        :param limit: integer limit forwarded to the splitter
                      (presumably a segment size -- confirm with implementations)
        :param get_buffer: callable used to obtain the file's bytes
        :param save_image: callback for images found while parsing
        """

    @abstractmethod
    def get_content(self, file, save_image):
        """
        Extract the content of ``file``; the format is implementation-defined.

        :param file: the uploaded file object
        :param save_image: callback for images found while parsing
        """
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# coding=utf-8
2+
"""
3+
@project: MaxKB
4+
@Author:虎
5+
@file: base_to_response.py
6+
@date:2024/9/6 16:04
7+
@desc:
8+
"""
9+
from abc import ABC, abstractmethod
10+
11+
from rest_framework import status
12+
13+
14+
class BaseToResponse(ABC):
    """
    Abstract interface for turning chat output into HTTP responses.

    Subclasses implement the block (single response) and stream-chunk
    variants; the base versions do nothing and return ``None``.
    """

    @abstractmethod
    def to_block_response(self, chat_id, chat_record_id, content, is_end, completion_tokens,
                          prompt_tokens, other_params: dict = None,
                          _status=status.HTTP_200_OK):
        """
        Build a complete (non-streamed) response.

        :param chat_id: id of the chat
        :param chat_record_id: id of the chat record
        :param content: response content
        :param is_end: end-of-answer flag
        :param completion_tokens: completion token count
        :param prompt_tokens: prompt token count
        :param other_params: optional extra values, defaults to ``None``
        :param _status: HTTP status, defaults to ``status.HTTP_200_OK``
        """

    @abstractmethod
    def to_stream_chunk_response(self, chat_id, chat_record_id, node_id, up_node_id_list, content, is_end,
                                 completion_tokens,
                                 prompt_tokens, other_params: dict = None):
        """
        Build one chunk of a streamed response.

        :param chat_id: id of the chat
        :param chat_record_id: id of the chat record
        :param node_id: id of the node producing the chunk
        :param up_node_id_list: ids of the upstream nodes
        :param content: chunk content
        :param is_end: end-of-answer flag
        :param completion_tokens: completion token count
        :param prompt_tokens: prompt token count
        :param other_params: optional extra values, defaults to ``None``
        """

    @staticmethod
    def format_stream_chunk(response_str):
        """
        Wrap ``response_str`` in ``data: <payload>`` followed by a blank line.

        :param response_str: already-serialized payload text
        :return: the framed chunk string
        """
        prefix = 'data: '
        terminator = '\n\n'
        return prefix + response_str + terminator
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# coding=utf-8
2+
"""
3+
@project: qabot
4+
@Author:虎
5+
@file: handle_exception.py
6+
@date:2023/9/5 19:29
7+
@desc:
8+
"""
9+
import logging
10+
import traceback
11+
12+
from rest_framework.exceptions import ValidationError, ErrorDetail, APIException
13+
from rest_framework.views import exception_handler
14+
15+
from common.exception.app_exception import AppApiException
16+
17+
from django.utils.translation import gettext_lazy as _
18+
19+
from common.result import result
20+
21+
22+
def to_result(key, args, parent_key=None):
    """
    Convert validation-exception args into the unified result object.

    :param key: the validated key to look up in ``args[0]``
    :param args: validation exception arguments; ``args[0]`` is read as a dict.
                 When ``args`` is empty an 'Unknown exception' detail is used.
    :param parent_key: dotted path of the enclosing keys, ``None`` at top level
    :return: API response object (``result.Result``)
    """
    # Keep only ErrorDetail entries and non-empty dicts stored under ``key``,
    # then take the first one (raises IndexError when nothing qualifies).
    error_detail = list(filter(
        lambda d: True if isinstance(d, ErrorDetail) else True if isinstance(d, dict) and len(
            d.keys()) > 0 else False,
        (args[0] if len(args) > 0 else {key: [ErrorDetail(_('Unknown exception'), code='unknown')]}).get(key)))[0]

    # Nested serializer error: recurse into the nested dict and report only its
    # first key, extending the dotted parent path with the current key.
    if isinstance(error_detail, dict):
        return list(map(lambda k: to_result(k, args=[error_detail],
                                            parent_key=key if parent_key is None else parent_key + '.' + key),
                        error_detail.keys() if len(error_detail) > 0 else []))[0]

    # String codes map to 500; any other code value is passed through as-is.
    # DRF's stock "This field is required." text is replaced by a message that
    # names the full dotted key (the Chinese suffix means "is a required parameter").
    return result.Result(500 if isinstance(error_detail.code, str) else error_detail.code,
                         message=f"【{key if parent_key is None else parent_key + '.' + key}】为必填参数" if str(
                             error_detail) == "This field is required." else error_detail)
43+
44+
45+
def validation_error_to_result(exc: ValidationError):
    """
    Convert a validation exception into the API response object.

    The first ``ErrorDetail`` located by ``find_err_detail`` becomes the error
    message; when none is found, or the lookup itself fails, the whole
    ``exc.detail`` is stringified instead.

    :param exc: the validation exception
    :return: API response object built by ``result.error``
    """
    try:
        first_detail = find_err_detail(exc.detail)
        message_source = exc.detail if first_detail is None else first_detail
        return result.error(str(message_source))
    except Exception:
        # Best effort: fall back to the raw detail rather than failing the handler.
        return result.error(str(exc.detail))
58+
59+
60+
def find_err_detail(exc_detail):
    """
    Return the first ``ErrorDetail`` found in a (possibly nested) error detail.

    The structure is searched depth-first, visiting dict values and list items
    in order. Unlike the previous version, a dict value that contains no
    ``ErrorDetail`` (for example an empty list) no longer ends the search:
    the remaining keys are still examined.

    :param exc_detail: an ``ErrorDetail``, or a dict/list nesting of them
    :return: the first ``ErrorDetail`` encountered, or ``None`` if there is none
    """
    if isinstance(exc_detail, ErrorDetail):
        return exc_detail
    if isinstance(exc_detail, dict):
        candidates = exc_detail.values()
    elif isinstance(exc_detail, list):
        candidates = exc_detail
    else:
        # Any other value cannot contain an ErrorDetail.
        return None
    for candidate in candidates:
        found = find_err_detail(candidate)
        if found is not None:
            return found
    return None
78+
79+
80+
def handle_exception(exc, context):
    """
    Project-wide exception handler for REST framework views.

    :param exc: the raised exception
    :param context: handler context supplied by REST framework
    :return: a unified result response, or REST framework's own response
    """
    # Call REST framework's default handler first to obtain its standard
    # error response (``None`` when it does not recognise the exception).
    response = exception_handler(exc, context)

    # Custom handling; order matters -- checked from most to least specific.
    if isinstance(exc, ValidationError):
        return validation_error_to_result(exc)
    if isinstance(exc, AppApiException):
        return result.Result(exc.code, exc.message, response_status=exc.status_code)
    if isinstance(exc, APIException):
        return result.error(exc.detail)

    if response is not None:
        return response

    # Unrecognised exception: log it with the traceback and wrap the message.
    logging.getLogger("max_kb_error").error(f'{str(exc)}:{traceback.format_exc()}')
    return result.error(str(exc))
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# coding=utf-8
2+
"""
3+
@project: qabot
4+
@Author:虎
5+
@file: __init__.py
6+
@date:2023/9/6 10:09
7+
@desc:
8+
"""
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
# coding=utf-8
2+
"""
3+
@project: MaxKB
4+
@Author:虎
5+
@file: tools.py
6+
@date:2024/9/11 16:41
7+
@desc:
8+
"""
9+
import io
10+
import uuid_utils.compat as uuid
11+
from functools import reduce
12+
from io import BytesIO
13+
from xml.etree.ElementTree import fromstring
14+
from zipfile import ZipFile
15+
16+
from PIL import Image as PILImage
17+
from openpyxl.drawing.image import Image as openpyxl_Image
18+
from openpyxl.packaging.relationship import get_rels_path, get_dependents
19+
from openpyxl.xml.constants import SHEET_DRAWING_NS, REL_NS, SHEET_MAIN_NS
20+
21+
from common.handle.base_parse_qa_handle import get_title_row_index_dict, get_row_value
22+
from knowledge.models import File
23+
24+
25+
def parse_element(element) -> {}:
    """
    Map picture names to their relationship ids in a drawing XML tree.

    For every node that has an ``nvPicPr`` child, the ``name`` attribute of the
    first element inside ``nvPicPr`` is paired with the ``embed`` attribute of
    the first element inside ``blipFill``.

    :param element: root XML element to search
    :return: dict of picture name -> embed id ("" when no ``blipFill`` was seen)
    """
    drawing_ns = "{%s}" % SHEET_DRAWING_NS
    pic_props_tag = drawing_ns + "nvPicPr"
    blip_fill_tag = drawing_ns + "blipFill"
    embed_attr = "{%s}embed" % REL_NS

    name_to_embed = {}
    for picture in level_order_traversal(element, pic_props_tag):
        picture_name = ""
        rel_id = ""
        for part in picture:
            tag = part.tag
            if tag == pic_props_tag:
                picture_name = part[0].attrib["name"]
            elif tag == blip_fill_tag:
                rel_id = part[0].attrib[embed_attr]
        # Pictures without a name cannot be referenced, so they are dropped.
        if picture_name:
            name_to_embed[picture_name] = rel_id
    return name_to_embed
40+
41+
42+
def parse_element_sheet_xml(element) -> []:
    """
    Collect the text of every ``f`` (formula) element located by the traversal.

    :param element: root XML element of a worksheet
    :return: list of formula texts (an entry is ``None`` for an empty ``f``)
    """
    formula_tag = "{%s}" % SHEET_MAIN_NS + "f"
    return [
        child.text
        for holder in level_order_traversal(element, formula_tag)
        for child in holder
        if child.tag == formula_tag
    ]
51+
52+
53+
def level_order_traversal(root, flag: str) -> list:
    """
    Breadth-first search for nodes that have a direct child tagged ``flag``.

    A matching node is collected and its subtree is not searched any further;
    non-matching nodes have their children queued.

    :param root: root element; iterating a node must yield its children,
                 each exposing a ``tag`` attribute
    :param flag: fully-qualified tag name to look for among direct children
    :return: matching nodes in breadth-first order
    """
    # Local import keeps this block self-contained; deque gives O(1) pops
    # from the left where ``list.pop(0)`` is O(n).
    from collections import deque

    pending = deque([root])
    targets = []
    while pending:
        node = pending.popleft()
        if any(child.tag == flag for child in node):
            targets.append(node)
            continue
        pending.extend(node)
    return targets
65+
66+
67+
def handle_images(deps, archive: ZipFile) -> []:
    """
    Load the image behind each relationship in ``deps`` from ``archive``.

    Relationships whose target cannot be read or decoded are printed and
    skipped, so the result may be shorter than ``deps``.

    :param deps: iterable of relationships exposing ``id`` and ``target``
    :param archive: the opened xlsx archive
    :return: list of openpyxl images, each tagged with ``embed`` and ``target``
    """
    loaded = []
    if not PILImage:  # Pillow not installed, drop images
        return loaded
    for dep in deps:
        try:
            raw_bytes = archive.read(dep.target)
            picture = openpyxl_Image(BytesIO(raw_bytes))
        except Exception as e:
            print(e)
        else:
            picture.embed = dep.id  # relationship id (rId) of the file
            picture.target = dep.target  # path of the file inside the archive
            loaded.append(picture)
    return loaded
82+
83+
84+
def xlsx_embed_cells_images(buffer) -> dict:
    """
    Extract the images embedded in worksheet cells of an xlsx file.

    Each picture name declared in ``xl/cellimages.xml`` is matched (as a
    substring) against the formula texts of all worksheets; the last matching
    formula identifies the cell content that refers to the picture.

    :param buffer: file path or file-like object accepted by ``ZipFile``
    :return: dict mapping ``'=' + formula_text`` to a ``File`` whose
             ``meta['content']`` holds the picture re-encoded as JPEG bytes
    :raises KeyError: when the archive has no ``xl/cellimages.xml``
                      (raised by ``ZipFile.read``)
    """
    result = {}
    # The context manager closes the archive even when parsing fails.
    with ZipFile(buffer) as archive:
        # Parse the relationships of cellimages.xml and load the images.
        deps = get_dependents(archive, get_rels_path("xl/cellimages.xml"))
        images_by_rel_id = {
            image.embed: image for image in handle_images(deps=deps, archive=archive)
        }

        # Worksheets and the formula texts found in them, keyed by sheet number.
        sheet_list = {}
        for item in archive.namelist():
            if not item.startswith('xl/worksheets/sheet'):
                continue
            key = item.split('/')[-1].split('.')[0].split('sheet')[-1]
            sheet_list[key] = parse_element_sheet_xml(fromstring(archive.read(item)))
        # Flatten once instead of once per picture; drop empty <f/> elements,
        # whose text is None and cannot be substring-tested.
        all_formulas = [
            formula for formulas in sheet_list.values() for formula in formulas if formula
        ]

        cell_images_xml = parse_element(fromstring(archive.read("xl/cellimages.xml")))
        for name, embed in cell_images_xml.items():
            img = images_by_rel_id.get(embed)
            if img is None:
                # handle_images skips unreadable pictures; skip them here too
                # instead of failing on ``None.target``.
                continue
            matching = [formula for formula in all_formulas if name in formula]
            if not matching:
                continue
            image_excel_id = matching[-1]
            with archive.open(img.target) as f:
                # convert() loads the pixel data, so the member can be closed.
                im = PILImage.open(f).convert('RGB')
            img_byte = io.BytesIO()
            im.save(img_byte, format='JPEG')
            image = File(id=uuid.uuid7(), file_name=img.path,
                         meta={'debug': False, 'content': img_byte.getvalue()})
            result['=' + image_excel_id] = image
    return result

0 commit comments

Comments
 (0)