Skip to content

Commit 83d9743

Browse files
committed
fix: 修复导入word文档,有的图片导入不进去
1 parent aab9cc6 commit 83d9743

File tree

1 file changed

+35
-10
lines changed

1 file changed

+35
-10
lines changed

apps/common/handle/impl/doc_split_handle.py

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import re
1111
import traceback
1212
import uuid
13+
from functools import reduce
1314
from typing import List
1415

1516
from docx import Document, ImagePart
@@ -31,6 +32,7 @@
3132
old_docx_nsmap = {'v': 'urn:schemas-microsoft-com:vml'}
3233
combine_nsmap = {**ns.nsmap, **old_docx_nsmap}
3334

35+
3436
def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=True):
3537
if is_new_docx:
3638
image_ids = image.xpath('.//a:blip/@r:embed')
@@ -46,18 +48,31 @@ def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=T
4648
return f'![](/api/image/{image_uuid})'
4749

4850

51+
def get_paragraph_element_images(paragraph_element, doc: Document, images_list, get_image_id):
    """Collect every picture element found underneath *paragraph_element*.

    Two kinds of picture markup are searched, in this order:

    * ``.//pic:pic`` - DrawingML pictures
    * ``.//w:pict``  - legacy pictures

    :param paragraph_element: XML element exposing an ``xpath(expr)`` method.
    :param doc: unused here; kept so the signature matches the sibling helpers.
    :param images_list: unused here; kept for signature parity.
    :param get_image_id: unused here; kept for signature parity.
    :return: a flat list of the matched elements (all ``pic:pic`` matches
        first, then all ``w:pict`` matches); empty when nothing matched.
    """
    images = []
    for images_xpath in (".//pic:pic", ".//w:pict"):
        try:
            found = paragraph_element.xpath(images_xpath)
        except Exception:
            # Deliberate best-effort: an element that cannot evaluate this
            # expression simply contributes no images of that kind, and the
            # remaining expressions are still tried.
            continue
        if found:
            images.extend(found)
    return images
63+
64+
65+
def images_to_string(images, doc: Document, images_list, get_image_id):
    """Convert picture elements to markdown image links and concatenate them.

    Each element is passed to ``image_to_mode``; results that are ``None``
    are dropped and the remaining strings are joined without a separator.

    :param images: iterable of picture elements (``pic:pic`` and/or ``w:pict``).
    :param doc: document object forwarded to ``image_to_mode``.
    :param images_list: list forwarded to ``image_to_mode``.
    :param get_image_id: callable forwarded to ``image_to_mode``.
    :return: the concatenated markdown, or ``""`` when nothing was converted.
    """
    rendered = []
    for image in images:
        # Legacy pictures are ``w:pict`` elements. They must be converted with
        # is_new_docx=False (as the code did before the two picture kinds were
        # merged into one list); otherwise image_to_mode searches them with
        # the DrawingML ``a:blip`` lookup that only applies to ``pic:pic``.
        tag = getattr(image, 'tag', None)
        is_legacy_pict = isinstance(tag, str) and tag.endswith('}pict')
        item = image_to_mode(image, doc, images_list, get_image_id,
                             is_new_docx=not is_legacy_pict)
        if item is not None:
            rendered.append(item)
    return "".join(rendered)
69+
70+
4971
def get_paragraph_element_txt(paragraph_element, doc: Document, images_list, get_image_id):
5072
try:
51-
images = paragraph_element.xpath(".//pic:pic")
52-
old_docx_images = paragraph_element.xpath(".//w:pict")
73+
images = get_paragraph_element_images(paragraph_element, doc, images_list, get_image_id)
5374
if len(images) > 0:
54-
return "".join(
55-
[item for item in [image_to_mode(image, doc, images_list, get_image_id) for image in images] if
56-
item is not None])
57-
elif len(old_docx_images) > 0:
58-
return "".join(
59-
[item for item in [image_to_mode(image, doc, images_list, get_image_id, is_new_docx=False) for image in old_docx_images] if
60-
item is not None])
75+
return images_to_string(images, doc, images_list, get_image_id)
6176
elif paragraph_element.text is not None:
6277
return paragraph_element.text
6378
return ""
@@ -101,8 +116,18 @@ def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_
101116
try:
102117
psn = paragraph.style.name
103118
if psn.startswith('Heading'):
104-
return "".join(["#" for i in range(int(psn.replace("Heading ", '')))]) + " " + paragraph.text
119+
title = "".join(["#" for i in range(int(psn.replace("Heading ", '')))]) + " " + paragraph.text
120+
images = reduce(lambda x, y: [*x, *y],
121+
[get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
122+
paragraph._element],
123+
[])
124+
125+
if len(images) > 0:
126+
return title + '\n' + images_to_string(images, doc, images_list, get_image_id) if len(
127+
paragraph.text) > 0 else images_to_string(images, doc, images_list, get_image_id)
128+
return title
105129
except Exception as e:
130+
traceback.print_exc()
106131
return paragraph.text
107132
return get_paragraph_txt(paragraph, doc, images_list, get_image_id)
108133

0 commit comments

Comments
 (0)