|
| 1 | +# coding=utf-8 |
| 2 | +""" |
| 3 | + @project: MaxKB |
| 4 | + @Author:虎 |
| 5 | + @file: tools.py |
| 6 | + @date:2024/9/11 16:41 |
| 7 | + @desc: |
| 8 | +""" |
| 9 | +import io |
| 10 | +import uuid_utils.compat as uuid |
| 11 | +from functools import reduce |
| 12 | +from io import BytesIO |
| 13 | +from xml.etree.ElementTree import fromstring |
| 14 | +from zipfile import ZipFile |
| 15 | + |
| 16 | +from PIL import Image as PILImage |
| 17 | +from openpyxl.drawing.image import Image as openpyxl_Image |
| 18 | +from openpyxl.packaging.relationship import get_rels_path, get_dependents |
| 19 | +from openpyxl.xml.constants import SHEET_DRAWING_NS, REL_NS, SHEET_MAIN_NS |
| 20 | + |
| 21 | +from common.handle.base_parse_qa_handle import get_title_row_index_dict, get_row_value |
| 22 | +from knowledge.models import File |
| 23 | + |
| 24 | + |
def parse_element(element) -> dict:
    """Map each picture name to the relationship id of its embedded image.

    Collects every node that has an ``nvPicPr`` child (in the spreadsheet
    drawing namespace) and, for each one, reads:

    * the ``name`` attribute of the first child of ``nvPicPr``;
    * the ``embed`` attribute (relationships namespace) of the first child
      of ``blipFill``.

    :param element: root XML element to search (the caller in this file
        passes the parsed ``xl/cellimages.xml``).
    :return: ``{picture name: embed id}``. The embed id is ``""`` when the
        node has no ``blipFill`` child; nodes whose name is empty are skipped.
    :raises IndexError: if ``nvPicPr`` / ``blipFill`` has no children.
    :raises KeyError: if the expected attribute is missing.
    """
    # Build the qualified names once instead of on every loop iteration.
    xdr_namespace = "{%s}" % SHEET_DRAWING_NS
    nv_pic_tag = xdr_namespace + "nvPicPr"
    blip_fill_tag = xdr_namespace + "blipFill"
    rel_embed_attr = "{%s}embed" % REL_NS

    data = {}
    for target in level_order_traversal(element, nv_pic_tag):
        name = embed = ""
        for child in target:
            if child.tag == nv_pic_tag:
                name = child[0].attrib["name"]
            elif child.tag == blip_fill_tag:
                embed = child[0].attrib[rel_embed_attr]
        if name:
            data[name] = embed
    return data
| 40 | + |
| 41 | + |
def parse_element_sheet_xml(element) -> list:
    """Collect the text of every formula (``f``) element in a worksheet XML tree.

    :param element: root XML element to search (the caller in this file
        passes a parsed ``xl/worksheets/sheet*`` part).
    :return: list of formula strings, in traversal order.

    ``f`` elements without text are skipped: their ``text`` is ``None``,
    which carries no formula and would break substring tests performed on
    the returned items.
    """
    formula_tag = "{%s}f" % SHEET_MAIN_NS
    data = []
    for target in level_order_traversal(element, formula_tag):
        for child in target:
            if child.tag == formula_tag and child.text is not None:
                data.append(child.text)
    return data
| 51 | + |
| 52 | + |
def level_order_traversal(root, flag: str) -> list:
    """Breadth-first search for nodes that have a direct child tagged ``flag``.

    :param root: element to start from; any object that iterates over its
        children, each exposing a ``tag`` attribute.
    :param flag: fully qualified tag name to look for among direct children.
    :return: matching nodes in breadth-first order. The subtree below a
        matching node is not searched any further.
    """
    from collections import deque  # local import: O(1) pops from the left

    queue = deque([root])
    targets = []
    while queue:
        node = queue.popleft()
        if any(child.tag == flag for child in node):
            targets.append(node)
            # A match ends the descent on this branch.
            continue
        queue.extend(node)
    return targets
| 65 | + |
| 66 | + |
def handle_images(deps, archive: ZipFile) -> list:
    """Load the images referenced by a list of relationships.

    :param deps: iterable of relationship objects exposing ``target`` (path
        of the member inside the archive) and ``id`` (relationship id).
    :param archive: open zip archive to read the image bytes from.
    :return: list of openpyxl ``Image`` objects, each tagged with two extra
        attributes: ``embed`` (the relationship id) and ``target`` (the
        archive path). Entries that cannot be read or decoded are skipped.
    """
    import logging  # local import keeps this block independent of file-level imports

    images = []
    if not PILImage:  # Pillow not installed, drop images
        return images
    for dep in deps:
        try:
            image = openpyxl_Image(BytesIO(archive.read(dep.target)))
        except Exception:
            # Best effort: one unreadable image must not abort the others,
            # but record which one failed and why.
            logging.getLogger(__name__).warning(
                "Skipping unreadable image %s", getattr(dep, "target", None), exc_info=True
            )
            continue
        image.embed = dep.id  # relationship id (rId) of the file
        image.target = dep.target  # path of the file inside the archive
        images.append(image)
    return images
| 82 | + |
| 83 | + |
def xlsx_embed_cells_images(buffer) -> dict:
    """Extract images embedded in cells of an xlsx file, keyed by cell formula.

    Steps:

    1. Load the images referenced by the relationships of
       ``xl/cellimages.xml``.
    2. Collect the formula strings of every ``xl/worksheets/sheet*`` part.
    3. Read the picture name -> embed id mapping from ``xl/cellimages.xml``.
    4. For every picture whose name occurs (substring test) in at least one
       formula, re-encode the image as JPEG and store it under
       ``'=' + formula``, using the last matching formula.

    :param buffer: file path or binary file-like object accepted by
        ``ZipFile``.
    :return: ``{'=' + formula: File}`` where ``File.meta['content']`` holds
        the JPEG bytes. Pictures whose image could not be loaded, or that no
        formula references, are left out.
    :raises KeyError: ``ZipFile.read`` raises it when the archive has no
        ``xl/cellimages.xml`` member; that case is not handled here.
    """
    # The context manager closes the archive even when parsing fails midway.
    with ZipFile(buffer) as archive:
        # Images declared by cellimages.xml, indexed by relationship id.
        deps = get_dependents(archive, get_rels_path("xl/cellimages.xml"))
        images_by_rid = {
            image.embed: image for image in handle_images(deps=deps, archive=archive)
        }

        # Formulas per worksheet, keyed by the suffix after "sheet" in the file name.
        sheet_list = {}
        for item in archive.namelist():
            if not item.startswith('xl/worksheets/sheet'):
                continue
            key = item.split('/')[-1].split('.')[0].split('sheet')[-1]
            sheet_list[key] = parse_element_sheet_xml(fromstring(archive.read(item)))
        # Flatten once; formulas without text (None) cannot reference a picture.
        formulas = [
            formula
            for sheet in sheet_list.values()
            for formula in sheet
            if formula is not None
        ]

        cell_images_xml = parse_element(fromstring(archive.read("xl/cellimages.xml")))

        result = {}
        for name, embed in cell_images_xml.items():
            img = images_by_rid.get(embed)
            if img is None:
                # The image failed to load or the embed id is unknown.
                continue
            matches = [formula for formula in formulas if name in formula]
            if not matches:
                continue
            with archive.open(img.target) as f:
                # convert() reads the full image, so the member can be closed afterwards.
                im = PILImage.open(f).convert('RGB')
            img_byte = io.BytesIO()
            im.save(img_byte, format='JPEG')
            result['=' + matches[-1]] = File(
                id=uuid.uuid7(),
                file_name=img.path,
                meta={'debug': False, 'content': img_byte.getvalue()},
            )
    return result
0 commit comments