1010import re
1111import traceback
1212import uuid
13+ from functools import reduce
1314from typing import List
1415
1516from docx import Document , ImagePart
3132old_docx_nsmap = {'v' : 'urn:schemas-microsoft-com:vml' }
3233combine_nsmap = {** ns .nsmap , ** old_docx_nsmap }
3334
35+
3436def image_to_mode (image , doc : Document , images_list , get_image_id , is_new_docx = True ):
3537 if is_new_docx :
3638 image_ids = image .xpath ('.//a:blip/@r:embed' )
@@ -46,18 +48,31 @@ def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=T
4648 return f''
4749
4850
def get_paragraph_element_images(paragraph_element, doc: Document, images_list, get_image_id):
    """Collect the picture nodes found underneath ``paragraph_element``.

    Two XPath expressions are evaluated in order, ``.//pic:pic`` first and
    ``.//w:pict`` second, and every match is gathered into a single flat list
    that keeps that order.

    :param paragraph_element: object exposing an ``xpath(expression)`` method
    :param doc: accepted for signature parity with the sibling helpers; not
        used by this function
    :param images_list: not used by this function
    :param get_image_id: not used by this function
    :return: list of matched nodes; empty when nothing matched
    """
    picture_queries = (".//pic:pic", ".//w:pict")
    collected = []
    for query in picture_queries:
        try:
            matches = paragraph_element.xpath(query)
            if matches is None or len(matches) == 0:
                continue
            collected.extend(matches)
        except Exception:
            # Best-effort lookup: a failure on one expression is ignored so
            # the remaining expression still gets evaluated.
            continue
    return collected
63+
64+
def images_to_string(images, doc: Document, images_list, get_image_id):
    """Render each image element with ``image_to_mode`` and concatenate the results.

    ``image_to_mode`` looks up the embedded image id through
    ``.//a:blip/@r:embed`` when ``is_new_docx`` is true, which only applies
    to ``pic:pic`` elements. Elements whose local tag name is ``pict``
    (matched by ``.//w:pict``) are therefore rendered with
    ``is_new_docx=False``, so a mixed list of both kinds is handled
    correctly.

    :param images: iterable of picture elements (``pic:pic`` and/or ``w:pict``)
    :param doc: document forwarded unchanged to ``image_to_mode``
    :param images_list: forwarded unchanged to ``image_to_mode``
    :param get_image_id: forwarded unchanged to ``image_to_mode``
    :return: the non-``None`` results of ``image_to_mode`` joined into one
        string; ``""`` when ``images`` is empty
    """
    rendered = []
    for image in images:
        # Element tags are expected in "{namespace}local" form; only the
        # local name decides which lookup image_to_mode should use.
        local_name = str(getattr(image, "tag", "")).rpartition("}")[2]
        item = image_to_mode(image, doc, images_list, get_image_id,
                             is_new_docx=local_name != "pict")
        if item is not None:
            rendered.append(item)
    return "".join(rendered)
69+
70+
4971def get_paragraph_element_txt (paragraph_element , doc : Document , images_list , get_image_id ):
5072 try :
51- images = paragraph_element .xpath (".//pic:pic" )
52- old_docx_images = paragraph_element .xpath (".//w:pict" )
73+ images = get_paragraph_element_images (paragraph_element , doc , images_list , get_image_id )
5374 if len (images ) > 0 :
54- return "" .join (
55- [item for item in [image_to_mode (image , doc , images_list , get_image_id ) for image in images ] if
56- item is not None ])
57- elif len (old_docx_images ) > 0 :
58- return "" .join (
59- [item for item in [image_to_mode (image , doc , images_list , get_image_id , is_new_docx = False ) for image in old_docx_images ] if
60- item is not None ])
75+ return images_to_string (images , doc , images_list , get_image_id )
6176 elif paragraph_element .text is not None :
6277 return paragraph_element .text
6378 return ""
@@ -101,8 +116,18 @@ def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_
101116 try :
102117 psn = paragraph .style .name
103118 if psn .startswith ('Heading' ):
104- return "" .join (["#" for i in range (int (psn .replace ("Heading " , '' )))]) + " " + paragraph .text
119+ title = "" .join (["#" for i in range (int (psn .replace ("Heading " , '' )))]) + " " + paragraph .text
120+ images = reduce (lambda x , y : [* x , * y ],
121+ [get_paragraph_element_images (e , doc , images_list , get_image_id ) for e in
122+ paragraph ._element ],
123+ [])
124+
125+ if len (images ) > 0 :
126+ return title + '\n ' + images_to_string (images , doc , images_list , get_image_id ) if len (
127+ paragraph .text ) > 0 else images_to_string (images , doc , images_list , get_image_id )
128+ return title
105129 except Exception as e :
130+ traceback .print_exc ()
106131 return paragraph .text
107132 return get_paragraph_txt (paragraph , doc , images_list , get_image_id )
108133
0 commit comments