|
1 | 1 | # coding=utf-8 |
| 2 | +import io |
| 3 | + |
2 | 4 | from django.db.models import QuerySet |
3 | 5 |
|
4 | 6 | from application.flow.i_step_node import NodeResult |
5 | 7 | from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode |
6 | 8 | from dataset.models import File |
| 9 | +from dataset.serializers.document_serializers import split_handles, parse_table_handle_list, FileBufferHandle |
7 | 10 |
|
8 | 11 |
|
class BaseDocumentExtractNode(IDocumentExtractNode):
    """Flow step node that turns referenced files into one text blob."""

    def execute(self, document, **kwargs):
        """Extract the text content of every entry in ``document``.

        Each entry is a mapping that provides at least ``'file_id'`` (used to
        look up the stored ``File`` row) and ``'name'`` (the file name). The
        bytes of each file are offered to every handler in
        ``parse_table_handle_list + split_handles``; the first handler whose
        ``support`` check passes supplies the content for that file.

        :param document: iterable of document mappings, or ``None``.
        :param kwargs: unused here; accepted for interface compatibility.
        :return: ``NodeResult`` whose first payload is ``{'content': text}``;
            ``text`` is the concatenation of one section per extracted file,
            each prefixed by a separator line and a ``## <name>`` heading.
            It is the empty string when there is nothing to extract.
        """
        get_buffer = FileBufferHandle().get_buffer

        self.context['document_list'] = document
        spliter = '\n-----------------------------------\n'
        if not document:
            return NodeResult({'content': ''}, {})

        # The candidate handlers do not depend on the document, so build the
        # combined list once instead of once per file.
        handles = parse_table_handle_list + split_handles

        sections = []
        for doc in document:
            file = QuerySet(File).filter(id=doc['file_id']).first()
            buffer = io.BytesIO(file.get_byte().tobytes())
            # Give the in-memory buffer a file name; presumably the handlers
            # inspect ``buffer.name`` to recognise the file type -- confirm
            # against the handler implementations.
            buffer.name = doc['name']

            for split_handle in handles:
                if split_handle.support(buffer, get_buffer):
                    # Rewind to the start of the buffer before reading it.
                    buffer.seek(0)
                    file_content = split_handle.get_content(buffer)
                    sections.append(spliter + '## ' + doc['name'] + '\n' + file_content)
                    # Stop trying handlers for THIS file only. Returning here
                    # would drop every remaining document in the list.
                    break

        return NodeResult({'content': ''.join(sections)}, {})
23 | 36 |
|
|
0 commit comments