|
1 | 1 | # coding=utf-8 |
| 2 | +import io |
| 3 | +import mimetypes |
| 4 | + |
| 5 | +from django.core.files.uploadedfile import InMemoryUploadedFile |
2 | 6 | from django.db.models import QuerySet |
3 | 7 |
|
4 | 8 | from application.flow.i_step_node import NodeResult |
5 | 9 | from application.flow.step_node.document_split_node.i_document_split_node import IDocumentSplitNode |
6 | | -from knowledge.models import File |
| 10 | +from knowledge.models import File, FileSourceType |
7 | 11 | from knowledge.serializers.document import split_handles, FileBufferHandle |
8 | 12 |
|
9 | 13 |
|
def bytes_to_uploaded_file(file_bytes, file_name="file.txt"):
    """Wrap raw bytes in a Django ``InMemoryUploadedFile``.

    The content type is guessed from *file_name*; unrecognized extensions
    fall back to the generic binary type ``application/octet-stream``.

    Args:
        file_bytes: the raw file content as ``bytes``.
        file_name: name used both for the upload object and for
            content-type guessing.

    Returns:
        An ``InMemoryUploadedFile`` backed by an in-memory byte stream.
    """
    guessed_type, _encoding = mimetypes.guess_type(file_name)
    return InMemoryUploadedFile(
        file=io.BytesIO(file_bytes),
        field_name=None,
        name=file_name,
        content_type=guessed_type or "application/octet-stream",
        size=len(file_bytes),
        charset=None,
    )
| 36 | + |
10 | 37 | class BaseDocumentSplitNode(IDocumentSplitNode): |
11 | 38 | def save_context(self, details, workflow_manage): |
12 | 39 | self.context['content'] = details.get('content') |
13 | | - print(details) |
14 | 40 |
|
15 | | - def execute(self, file_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type, |
| 41 | + def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type, |
16 | 42 | paragraph_title_relate_problem, paragraph_title_relate_problem_reference, |
17 | 43 | document_name_relate_problem_type, document_name_relate_problem, |
18 | 44 | document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult: |
19 | 45 | get_buffer = FileBufferHandle().get_buffer |
| 46 | + self.context['file_list'] = files |
| 47 | + self.context['knowledge_id'] = knowledge_id |
20 | 48 |
|
21 | 49 | paragraph_list = [] |
22 | | - for doc in file_list: |
| 50 | + for doc in files: |
23 | 51 | file = QuerySet(File).filter(id=doc['file_id']).first() |
24 | | - file_id = file.id |
| 52 | + file_mem = bytes_to_uploaded_file(file.get_bytes(), file_name=file.file_name) |
| 53 | + |
25 | 54 | for split_handle in split_handles: |
26 | | - if split_handle.support(file, get_buffer): |
27 | | - result = split_handle.handle(file, patterns, with_filter, limit, get_buffer, self.save_image) |
| 55 | + if split_handle.support(file_mem, get_buffer): |
| 56 | + result = split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self.save_image) |
28 | 57 | if isinstance(result, list): |
29 | 58 | for item in result: |
30 | | - item['source_file_id'] = file_id |
| 59 | + item['source_file_id'] = file.id |
31 | 60 | paragraph_list = result |
32 | 61 | else: |
33 | | - result['source_file_id'] = file_id |
| 62 | + result['source_file_id'] = file.id |
34 | 63 | paragraph_list = [result] |
35 | 64 |
|
36 | | - self.context['file_list'] = file_list |
37 | 65 | self.context['paragraph_list'] = paragraph_list |
38 | 66 |
|
39 | | - print(paragraph_list) |
40 | 67 |
|
41 | 68 | return NodeResult({'paragraph_list': paragraph_list}, {}) |
42 | 69 |
|
43 | 70 | def save_image(self, image_list): |
44 | | - # if image_list is not None and len(image_list) > 0: |
45 | | - # exist_image_list = [str(i.get('id')) for i in |
46 | | - # QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')] |
47 | | - # save_image_list = [image for image in image_list if not exist_image_list.__contains__(str(image.id))] |
48 | | - # save_image_list = list({img.id: img for img in save_image_list}.values()) |
49 | | - # # save image |
50 | | - # for file in save_image_list: |
51 | | - # file_bytes = file.meta.pop('content') |
52 | | - # file.meta['knowledge_id'] = self.data.get('knowledge_id') |
53 | | - # file.source_type = FileSourceType.KNOWLEDGE |
54 | | - # file.source_id = self.data.get('knowledge_id') |
55 | | - # file.save(file_bytes) |
56 | | - pass |
| 71 | + if image_list is not None and len(image_list) > 0: |
| 72 | + exist_image_list = [str(i.get('id')) for i in |
| 73 | + QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')] |
| 74 | + save_image_list = [image for image in image_list if not exist_image_list.__contains__(str(image.id))] |
| 75 | + save_image_list = list({img.id: img for img in save_image_list}.values()) |
| 76 | + # save image |
| 77 | + for file in save_image_list: |
| 78 | + file_bytes = file.meta.pop('content') |
| 79 | + file.meta['knowledge_id'] = self.context.get('knowledge_id') |
| 80 | + file.source_type = FileSourceType.KNOWLEDGE |
| 81 | + file.source_id = self.context.get('knowledge_id') |
| 82 | + file.save(file_bytes) |
57 | 83 |
|
58 | 84 | def get_details(self, index: int, **kwargs): |
59 | 85 | return { |
|
0 commit comments