|
22 | 22 | from common.event.common import work_thread_pool |
23 | 23 | from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs |
24 | 24 | from common.exception.app_exception import AppApiException |
| 25 | +from common.handle.impl.doc_split_handle import DocSplitHandle |
| 26 | +from common.handle.impl.pdf_split_handle import PdfSplitHandle |
| 27 | +from common.handle.impl.text_split_handle import TextSplitHandle |
25 | 28 | from common.mixins.api_mixin import ApiMixin |
26 | 29 | from common.util.common import post |
27 | 30 | from common.util.field_message import ErrMessage |
@@ -593,17 +596,22 @@ def batch_delete(self, instance: Dict, with_valid=True): |
593 | 596 | return True |
594 | 597 |
|
595 | 598 |
|
class FileBufferHandle:
    """Read a file-like object once and hand back the same content afterwards.

    The first ``get_buffer`` call stores the result of ``file.read()`` on the
    instance; every later call returns that stored value without touching the
    file again, so several consumers can share one read of the stream.

    NOTE: once a buffer is stored, the ``file`` argument is ignored. Create a
    new ``FileBufferHandle`` for each file rather than reusing one instance.
    """

    # Class-level default meaning "nothing read yet"; the first successful
    # read shadows it with an instance attribute of the same name.
    buffer = None

    def get_buffer(self, file):
        """Return the content of ``file``, reading it only on the first call.

        :param file: any object exposing a ``read()`` method.
        :return: whatever ``file.read()`` returned on the first call.
        """
        if self.buffer is None:
            self.buffer = file.read()
        return self.buffer
| 606 | + |
| 607 | + |
# Handler used when no entry in ``split_handles`` claims a file.
default_split_handle = TextSplitHandle()

# Handlers are tried in list order and the first one whose ``support()``
# accepts the file is used, so the default text handler is placed last.
split_handles = [DocSplitHandle(), PdfSplitHandle(), default_split_handle]
| 610 | + |
| 611 | + |
def file_to_paragraph(file, pattern_list: List, with_filter: bool, limit: int):
    """Split an uploaded file into paragraphs using the first matching handler.

    Each handler in ``split_handles`` is asked, in order, whether it supports
    ``file``; the first one that does performs the split. If none does, the
    work falls back to ``default_split_handle``.

    :param file: file-like object to split; handlers read it via ``get_buffer``.
    :param pattern_list: split patterns, forwarded unchanged to the handler.
    :param with_filter: forwarded unchanged to the handler.
    :param limit: forwarded unchanged to the handler.
    :return: whatever the chosen handler's ``handle()`` returns (presumably a
        dict with ``name`` and ``content`` keys -- confirm against the
        handler implementations).
    """
    # One buffer per call: ``support()`` and ``handle()`` of every handler
    # share a single read of the file instead of each consuming the stream.
    get_buffer = FileBufferHandle().get_buffer

    # The generator is lazy, so ``support()`` stops being called as soon as
    # one handler accepts the file.
    chosen_handle = next(
        (candidate for candidate in split_handles
         if candidate.support(file, get_buffer)),
        default_split_handle,
    )
    return chosen_handle.handle(file, pattern_list, with_filter, limit, get_buffer)
0 commit comments