Skip to content

Commit 99477d6

Browse files
committed
refactor: simplify file handling in document extraction by removing unnecessary byte conversion and enhancing file saving logic
1 parent e5a2c57 commit 99477d6

File tree

2 files changed

+36
-36
lines changed

2 files changed

+36
-36
lines changed

apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py

Lines changed: 35 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -1,39 +1,14 @@
11
# coding=utf-8
2+
import ast
23
import io
3-
import mimetypes
44

5-
from django.core.files.uploadedfile import InMemoryUploadedFile
5+
import uuid_utils.compat as uuid
66
from django.db.models import QuerySet
77

88
from application.flow.i_step_node import NodeResult
99
from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode
1010
from knowledge.models import File, FileSourceType
1111
from knowledge.serializers.document import split_handles, parse_table_handle_list, FileBufferHandle
12-
from oss.serializers.file import FileSerializer
13-
14-
15-
def bytes_to_uploaded_file(file_bytes, file_name="file.txt"):
16-
content_type, _ = mimetypes.guess_type(file_name)
17-
if content_type is None:
18-
# 如果未能识别,设置为默认的二进制文件类型
19-
content_type = "application/octet-stream"
20-
# 创建一个内存中的字节流对象
21-
file_stream = io.BytesIO(file_bytes)
22-
23-
# 获取文件大小
24-
file_size = len(file_bytes)
25-
26-
# 创建 InMemoryUploadedFile 对象
27-
uploaded_file = InMemoryUploadedFile(
28-
file=file_stream,
29-
field_name=None,
30-
name=file_name,
31-
content_type=content_type,
32-
size=file_size,
33-
charset=None,
34-
)
35-
return uploaded_file
36-
3712

3813
splitter = '\n`-----------------------------------`\n'
3914

@@ -69,17 +44,42 @@ def save_image(image_list):
6944
'file_id': str(image.id)
7045
}
7146
file_bytes = image.meta.pop('content')
72-
f = bytes_to_uploaded_file(file_bytes, image.file_name)
73-
FileSerializer(data={
74-
'file': f,
75-
'meta': meta,
76-
'source_id': meta['application_id'] if meta['application_id'] else meta['knowledge_id'],
77-
'source_type': FileSourceType.APPLICATION.value if meta[
78-
'application_id'] else FileSourceType.KNOWLEDGE.value
79-
}).upload()
47+
new_file = File(
48+
id=uuid.uuid7(),
49+
file_name=image.file_name,
50+
file_size=len(file_bytes),
51+
source_type=FileSourceType.APPLICATION.value if meta[
52+
'application_id'] else FileSourceType.KNOWLEDGE.value,
53+
source_id=meta['application_id'] if meta['application_id'] else meta['knowledge_id'],
54+
meta=meta
55+
)
56+
new_file.save(file_bytes)
8057

8158
document_list = []
8259
for doc in document:
60+
if 'file_bytes' in doc:
61+
file_bytes = doc['file_bytes']
62+
# 如果是字符串,转换为字节
63+
if isinstance(file_bytes, str):
64+
file_bytes = ast.literal_eval(file_bytes)
65+
doc['file_id'] = doc.get('file_id') or uuid.uuid7()
66+
meta = {
67+
'debug': False if (application_id or knowledge_id) else True,
68+
'chat_id': chat_id,
69+
'application_id': str(application_id) if application_id else None,
70+
'knowledge_id': str(knowledge_id) if knowledge_id else None,
71+
'file_id': str(doc['file_id'])
72+
}
73+
new_file = File(
74+
id=doc['file_id'],
75+
file_name=doc['name'],
76+
file_size=len(file_bytes),
77+
source_type=FileSourceType.APPLICATION.value if meta[
78+
'application_id'] else FileSourceType.KNOWLEDGE.value,
79+
source_id=meta['application_id'] if meta['application_id'] else meta['knowledge_id'],
80+
meta={}
81+
)
82+
new_file.save(file_bytes)
8383
file = QuerySet(File).filter(id=doc['file_id']).first()
8484
buffer = io.BytesIO(file.get_bytes())
8585
buffer.name = doc['name'] # this is the important line

apps/common/handle/impl/table/xlsx_parse_table_handle.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -112,5 +112,5 @@ def get_content(self, file, save_image):
112112

113113
return md_tables
114114
except Exception as e:
115-
max_kb.error(f'excel split handle error: {e}')
115+
maxkb_logger.error(f'excel split handle error: {e}')
116116
return f'error: {e}'

0 commit comments

Comments
 (0)