1Panel-dev
diff --git a/‎apps/common/handle/impl/csv_split_handle.py‎
Lines changed: 70 additions & 0 deletions b/‎apps/common/handle/impl/csv_split_handle.py‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎apps/common/handle/impl/qa/zip_parse_qa_handle.py‎
Lines changed: 162 additions & 0 deletions b/‎apps/common/handle/impl/qa/zip_parse_qa_handle.py‎
Lines changed: 162 additions & 0 deletions
diff --git a/‎apps/common/handle/impl/xls_split_handle.py‎
Lines changed: 80 additions & 0 deletions b/‎apps/common/handle/impl/xls_split_handle.py‎
Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,70 @@
+# coding=utf-8
+"""
+    @project: maxkb
+    @Author：虎
+    @file： csv_split_handle.py
+    @date：2024/5/21 14:59
+    @desc:
+"""
+import csv
+import io
+from typing import List
+
+from charset_normalizer import detect
+
+from common.handle.base_split_handle import BaseSplitHandle
+
+
+def post_cell(cell_value):
+    """
+    Escape a cell value so it can be placed inside a Markdown table cell.
+    @param cell_value: cell text
+    @return: the text with newlines replaced by ``<br>`` and pipes by ``&#124;``
+    """
+    for raw, escaped in (('\n', '<br>'), ('|', '&#124;')):
+        cell_value = cell_value.replace(raw, escaped)
+    return cell_value
+
+
+def row_to_md(row):
+    """
+    Render one row as a single Markdown table line.
+    @param row: iterable of cell strings; a ``None`` cell becomes an empty cell
+    @return: the line ``| a | b |`` terminated by a newline
+    """
+    cells = [post_cell(value) if value is not None else '' for value in row]
+    return f"| {' | '.join(cells)} |\n"
+
+
+class CsvSplitHandle(BaseSplitHandle):
+    """
+    Split a CSV file into paragraphs, each one a Markdown table.
+
+    The header row (plus the ``---`` separator line) is repeated at the top of
+    every paragraph so that each paragraph is a complete table on its own.
+    """
+
+    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
+        """
+        Convert the rows of a CSV file into Markdown table paragraphs.
+        @param file: file to split; ``file.name`` becomes the result name
+        @param pattern_list: not used by this handler
+        @param with_filter: not used by this handler
+        @param limit: paragraph length threshold in characters; a row that would
+                      bring the current paragraph to ``limit`` or beyond starts a
+                      new paragraph instead
+        @param get_buffer: callable returning the raw bytes of ``file``
+        @param save_image: not used by this handler
+        @return: ``{'name': file.name, 'content': [{'content': markdown, 'title': ''}, ...]}``;
+                 on a read/parse error the paragraphs completed so far are returned
+        """
+        buffer = get_buffer(file)
+        paragraphs = []
+        result = {'name': file.name, 'content': paragraphs}
+        try:
+            reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=detect(buffer)['encoding']))
+            try:
+                title_row_list = next(reader)
+            except StopIteration:
+                # Empty file: there is no header, hence nothing to convert.
+                return result
+            if not title_row_list:
+                return result
+            title_md_content = row_to_md(title_row_list)
+            title_md_content += '| ' + ' | '.join(
+                ['---' if cell is not None else '' for cell in title_row_list]) + ' |\n'
+            result_item_content = ''
+            for row in reader:
+                next_md_content = row_to_md(row)
+                if result_item_content and len(result_item_content) + len(next_md_content) >= limit:
+                    # The row does not fit: close the current paragraph. The row
+                    # itself is kept and opens the next paragraph below (it used
+                    # to be discarded here).
+                    paragraphs.append({'content': result_item_content, 'title': ''})
+                    result_item_content = ''
+                if not result_item_content:
+                    # Every paragraph starts with the header so it renders as a table.
+                    result_item_content = title_md_content
+                result_item_content += next_md_content
+            if result_item_content:
+                paragraphs.append({'content': result_item_content, 'title': ''})
+            return result
+        except Exception:
+            # Best effort: an undecodable or malformed file yields whatever
+            # paragraphs were completed before the failure.
+            return result
+
+    def get_content(self, file, save_image):
+        """
+        Not implemented for CSV files; returns None.
+        """
+        pass
+
+    def support(self, file, get_buffer):
+        """
+        @param file: file to check; only ``file.name`` is inspected
+        @param get_buffer: not used by this handler
+        @return: True when the file name ends with ``.csv`` (case-insensitive)
+        """
+        file_name: str = file.name.lower()
+        return file_name.endswith(".csv")
@@ -0,0 +1,162 @@
+# coding=utf-8
+"""
+    @project: maxkb
+    @Author：虎
+    @file： text_split_handle.py
+    @date：2024/3/27 18:19
+    @desc:
+"""
+import io
+import os
+import re
+import uuid
+import zipfile
+from typing import List
+from urllib.parse import urljoin
+
+from django.db.models import QuerySet
+
+from common.handle.base_parse_qa_handle import BaseParseQAHandle
+from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
+from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
+from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
+from common.util.common import parse_md_image
+from dataset.models import Image
+
+
+class FileBufferHandle:
+    """
+    Reads a file once and returns the same content on every later call.
+    """
+    # Content of the first ``file.read()``; None until get_buffer has run.
+    buffer = None
+
+    def get_buffer(self, file):
+        """
+        @param file: object exposing ``read()``
+        @return: the content read on the first call; later calls return that
+                 cached content without reading again, whatever file is passed
+        """
+        if self.buffer is not None:
+            return self.buffer
+        self.buffer = file.read()
+        return self.buffer
+
+
+# Handlers tried in this order by file_to_paragraph; the first one whose
+# support() accepts the file is used.
+split_handles = [XlsParseQAHandle(), XlsxParseQAHandle(), CsvParseQAHandle()]
+
+
+def save_inner_image(image_list):
+    """
+    Image insertion logic for sub-modules: bulk-creates the given images.
+    @param image_list: Image instances to insert; None or empty does nothing
+    @return: None
+    """
+    if image_list is None or len(image_list) == 0:
+        return
+    QuerySet(Image).bulk_create(image_list)
+
+
+def file_to_paragraph(file):
+    """
+    Convert a file with the first handler in ``split_handles`` that supports it.
+    @param file: file to convert
+    @return: whatever the selected handler's ``handle`` returns
+    @raise Exception: when no handler supports the file (the message reads
+                      "unsupported file format" in Chinese)
+    """
+    get_buffer = FileBufferHandle().get_buffer
+    matching = (handler for handler in split_handles if handler.support(file, get_buffer))
+    handler = next(matching, None)
+    if handler is None:
+        raise Exception("不支持的文件格式")
+    return handler.handle(file, get_buffer, save_inner_image)
+
+
+def is_valid_uuid(uuid_str: str):
+    """
+    Check whether a string is a valid UUID.
+    @param uuid_str: string to check
+    @return: bool
+    """
+    try:
+        uuid.UUID(uuid_str)
+        return True
+    except ValueError:
+        return False
+
+
+def get_image_list(result_list: list, zip_files: List[str]):
+    """
+    Collect the images referenced by the paragraphs that exist in the archive.
+
+    Paragraph contents are rewritten in place: unless the reference already
+    points at ``api/file/<uuid>`` or ``api/image/<uuid>``, the referenced path
+    is replaced with ``/api/image/<new id>``.
+    @param result_list: parsed results; ``name`` and ``paragraphs`` of each are read
+    @param zip_files: names of the entries in the zip archive
+    @return: list of ``{'source_file': archive entry name, 'image_id': id}``
+    """
+    # Membership is tested once per image reference; a set keeps that O(1).
+    archive_names = set(zip_files)
+    image_file_list = []
+    for result in result_list:
+        for p in result.get('paragraphs', []):
+            content: str = p.get('content', '')
+            for image in parse_md_image(content):
+                # Raw string: "\(" in a normal literal is an invalid escape
+                # sequence and triggers a SyntaxWarning on recent Pythons.
+                search = re.search(r"\(.*\)", image)
+                if not search:
+                    continue
+                source_image_path = search.group().replace('(', '').replace(')', '')
+                # A leading '/' is made relative ('./...') before the path is
+                # resolved against the name of the file that references it.
+                relative_path = '.' + source_image_path if source_image_path.startswith('/') else source_image_path
+                image_path = urljoin(result.get('name'), relative_path)
+                if image_path not in archive_names:
+                    continue
+                image_id = None
+                if image_path.startswith(('api/file/', 'api/image/')):
+                    existing_id = image_path.replace('api/file/', '').replace('api/image/', '')
+                    if is_valid_uuid(existing_id):
+                        # The reference already carries a usable id: keep it
+                        # and leave the paragraph content untouched.
+                        image_id = existing_id
+                if image_id is None:
+                    image_id = str(uuid.uuid1())
+                    content = content.replace(source_image_path, f'/api/image/{image_id}')
+                    p['content'] = content
+                image_file_list.append({'source_file': image_path, 'image_id': image_id})
+
+    return image_file_list
+
+
+def filter_image_file(result_list: list, image_list):
+    """
+    Drop the results whose name is one of the collected image source files.
+    @param result_list: parsed results; ``name`` is read (missing counts as '')
+    @param image_list: image entries; ``source_file`` is read
+    @return: new list with the remaining results, original order preserved
+    """
+    image_sources = [image.get('source_file') for image in image_list]
+    kept = []
+    for item in result_list:
+        if item.get('name', '') in image_sources:
+            continue
+        kept.append(item)
+    return kept
+
+
+class ZipParseQAHandle(BaseParseQAHandle):
+    """
+    Parses every supported file inside a zip archive and stores the images
+    from the archive that the parsed paragraphs reference.
+    """
+
+    def handle(self, file, get_buffer, save_image):
+        """
+        @param file: the zip file
+        @param get_buffer: callable returning the raw bytes of ``file``
+        @param save_image: callable receiving the list of Image objects to store
+        @return: list of parse results, without the entries that turned out to
+                 be images referenced by the paragraphs
+        """
+        buffer = get_buffer(file)
+        result = []
+        # Open the zip archive
+        with zipfile.ZipFile(io.BytesIO(buffer), 'r') as zip_ref:
+            # Names of all entries in the archive
+            files = zip_ref.namelist()
+            for entry_name in files:
+                if entry_name.endswith('/'):
+                    # Directory entry: nothing to parse.
+                    continue
+                with zip_ref.open(entry_name) as f:
+                    try:
+                        value = file_to_paragraph(f)
+                    except Exception:
+                        # Best effort: entries no handler supports (images, for
+                        # instance) or that fail to parse are skipped.
+                        continue
+                if isinstance(value, list):
+                    result.extend(value)
+                else:
+                    result.append(value)
+            image_list = get_image_list(result, files)
+            result = filter_image_file(result, image_list)
+            image_mode_list = []
+            for image in image_list:
+                source_file = image.get('source_file')
+                with zip_ref.open(source_file) as f:
+                    image_mode_list.append(Image(id=image.get('image_id'), image=f.read(),
+                                                 image_name=os.path.basename(source_file)))
+            save_image(image_mode_list)
+        return result
+
+    def support(self, file, get_buffer):
+        """
+        @param file: file to check; only ``file.name`` is inspected
+        @param get_buffer: not used by this handler
+        @return: True when the file name ends with ``.zip`` (case-insensitive)
+        """
+        # The name is lower-cased first, so a single suffix check is enough.
+        file_name: str = file.name.lower()
+        return file_name.endswith(".zip")
@@ -0,0 +1,80 @@
+# coding=utf-8
+"""
+    @project: maxkb
+    @Author：虎
+    @file： xls_split_handle.py
+    @date：2024/5/21 14:59
+    @desc:
+"""
+from typing import List
+
+import xlrd
+
+from common.handle.base_split_handle import BaseSplitHandle
+
+
+def post_cell(cell_value):
+    """
+    Escape a cell value so it can be placed inside a Markdown table cell.
+    @param cell_value: cell text
+    @return: the text with newlines replaced by ``<br>`` and pipes by ``&#124;``
+    """
+    without_newlines = cell_value.replace('\n', '<br>')
+    return without_newlines.replace('|', '&#124;')
+
+
+def row_to_md(row):
+    """
+    Render one sheet row as a single Markdown table line.
+    @param row: iterable of cell values; each is converted with ``str`` and a
+                ``None`` cell becomes an empty cell
+    @return: the line ``| a | b |`` terminated by a newline
+    """
+    cells = [post_cell(str(value)) if value is not None else '' for value in row]
+    return f"| {' | '.join(cells)} |\n"
+
+
+def handle_sheet(file_name, sheet, limit: int):
+    """
+    Convert one worksheet into paragraphs, each one a Markdown table.
+
+    The first row is treated as the header and is repeated (with the ``---``
+    separator line) at the top of every paragraph.
+    @param file_name: name stored in the result
+    @param sheet: worksheet; ``nrows`` and ``row_values(index)`` are used
+    @param limit: paragraph length threshold in characters; a row that would
+                  bring the current paragraph to ``limit`` or beyond starts a
+                  new paragraph instead
+    @return: ``{'name': file_name, 'content': [{'content': markdown, 'title': ''}, ...]}``
+    """
+    paragraphs = []
+    result = {'name': file_name, 'content': paragraphs}
+    if sheet.nrows == 0:
+        # Empty sheet: there is no header, hence nothing to convert.
+        return result
+    title_row_list = sheet.row_values(0)
+    if not title_row_list:
+        return result
+    title_md_content = row_to_md(title_row_list)
+    title_md_content += '| ' + ' | '.join(
+        ['---' if cell is not None else '' for cell in title_row_list]) + ' |\n'
+    result_item_content = ''
+    # Rows are read one at a time instead of being materialized up front.
+    for row_index in range(1, sheet.nrows):
+        next_md_content = row_to_md(sheet.row_values(row_index))
+        if result_item_content and len(result_item_content) + len(next_md_content) >= limit:
+            # The row does not fit: close the current paragraph. The row itself
+            # is kept and opens the next paragraph below (it used to be
+            # discarded here).
+            paragraphs.append({'content': result_item_content, 'title': ''})
+            result_item_content = ''
+        if not result_item_content:
+            # Every paragraph starts with the header so it renders as a table.
+            result_item_content = title_md_content
+        result_item_content += next_md_content
+    if result_item_content:
+        paragraphs.append({'content': result_item_content, 'title': ''})
+    return result
+
+
+class XlsSplitHandle(BaseSplitHandle):
+    """
+    Splits an ``.xls`` workbook into one result per worksheet.
+    """
+
+    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
+        """
+        @param file: workbook file; ``file.name`` is used for naming
+        @param pattern_list: not used by this handler
+        @param with_filter: not used by this handler
+        @param limit: passed through to ``handle_sheet``
+        @param get_buffer: callable returning the raw bytes of ``file``
+        @param save_image: not used by this handler
+        @return: list with one ``handle_sheet`` result per worksheet; a single
+                 result with empty content when anything fails
+        """
+        buffer = get_buffer(file)
+        try:
+            workbook = xlrd.open_workbook(file_contents=buffer)
+            worksheets = workbook.sheets()
+            single_sheet = len(worksheets) == 1
+            sheet_results = []
+            for sheet in worksheets:
+                # A lone sheet named 'Sheet1' is reported under the file name;
+                # every other sheet is reported under its own name.
+                if single_sheet and sheet.name == 'Sheet1':
+                    sheet_result = handle_sheet(file.name, sheet, limit)
+                else:
+                    sheet_result = handle_sheet(sheet.name, sheet, limit)
+                if sheet_result is not None:
+                    sheet_results.append(sheet_result)
+            return sheet_results
+        except Exception:
+            return [{'name': file.name, 'content': []}]
+
+    def get_content(self, file, save_image):
+        """
+        Not implemented for xls files; returns None.
+        """
+        pass
+
+    def support(self, file, get_buffer):
+        """
+        @param file: file to check
+        @param get_buffer: callable returning the raw bytes of ``file``
+        @return: True when the name ends with ``.xls`` (case-insensitive) and
+                 ``xlrd.inspect_format`` returns a truthy value for the content
+        """
+        file_name: str = file.name.lower()
+        buffer = get_buffer(file)
+        if not file_name.endswith(".xls"):
+            return False
+        if not xlrd.inspect_format(content=buffer):
+            return False
+        return True