 import uuid
 from io import BytesIO
 from urllib.parse import urlparse
-from xml.etree import ElementTree
-
 import requests
 from dify_plugin import Tool
 from docx import Document as DocxDocument
+from docx.oxml.ns import qn
+from docx.text.run import Run
 
 from tools.document import Document, ExtractorResult
 from tools.extractor_base import BaseExtractor
@@ -156,7 +156,7 @@ def _parse_cell_paragraph(self, paragraph, image_map):
                     )
                     if not image_id:
                         continue
-
+
                     if image_id in paragraph.part.rels:
                         rel = paragraph.part.rels[image_id]
                         if rel.is_external:
@@ -194,40 +194,26 @@ def parse_docx(self, file_bytes):
 
         image_map, img_list = self._extract_images_from_docx(doc)
 
-        hyperlinks_url = None
-        url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+")
-        for para in doc.paragraphs:
-            for run in para.runs:
-                if run.text and hyperlinks_url:
-                    result = f" [{run.text}]({hyperlinks_url}) "
-                    run.text = result
-                    hyperlinks_url = None
-                if "HYPERLINK" in run.element.xml:
-                    try:
-                        xml = ElementTree.XML(run.element.xml)
-                        x_child = [c for c in xml.iter() if c is not None]
-                        for x in x_child:
-                            if x_child is None:
-                                continue
-                            if x.tag.endswith("instrText"):
-                                if x.text is None:
-                                    continue
-                                for i in url_pattern.findall(x.text):
-                                    hyperlinks_url = str(i)
-                    except Exception:
-                        logger.exception("Failed to parse HYPERLINK xml")
-
         def parse_paragraph(paragraph):
-            paragraph_content = []
-            for run in paragraph.runs:
-                if (
-                    hasattr(run.element, "tag")
-                    and isinstance(run.element.tag, str)
-                    and run.element.tag.endswith("r")
-                ):
+            def append_image_link(image_id, has_drawing, target_buffer):
+                """Helper to append image link from image_map based on relationship type."""
+                rel = doc.part.rels[image_id]
+                if rel.is_external:
+                    if image_id in image_map and not has_drawing:
+                        target_buffer.append(image_map[image_id])
+                else:
+                    image_part = rel.target_part
+                    if image_part in image_map and not has_drawing:
+                        target_buffer.append(image_map[image_part])
+
+            def process_run(run, target_buffer):
+                # Helper to extract text and embedded images from a run element and append them to target_buffer
+                if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"):
+                    # Process drawing type images
                     drawing_elements = run.element.findall(
                         ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing"
                     )
+                    has_drawing = False
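+                    # has_drawing guards the VML fallback below: once a DrawingML image from this run
+                    # has been appended, append_image_link skips the run's pict/VML candidates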
                     for drawing in drawing_elements:
                         blip_elements = drawing.findall(
                             ".//{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
@@ -237,14 +223,127 @@ def parse_paragraph(paragraph):
                                 "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
                             )
                             if embed_id:
-                                if embed_id in image_map:
-                                    paragraph_content.append(image_map[embed_id])
+                                rel = doc.part.rels.get(embed_id)
+                                if rel is not None and rel.is_external:
+                                    # External image: use embed_id as key
+                                    if embed_id in image_map:
+                                        has_drawing = True
+                                        target_buffer.append(image_map[embed_id])
                                 else:
+                                    # Internal image: use target_part as key
                                     image_part = doc.part.related_parts.get(embed_id)
                                     if image_part in image_map:
-                                        paragraph_content.append(image_map[image_part])
+                                        has_drawing = True
+                                        target_buffer.append(image_map[image_part])
+                    # Process pict type images
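+                    # (w:pict wraps legacy VML picture markup kept for compatibility with older Word documents)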
+                    shape_elements = run.element.findall(
+                        ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pict"
+                    )
+                    for shape in shape_elements:
+                        # Find image data in VML
+                        shape_image = shape.find(
+                            ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}binData"
+                        )
+                        if shape_image is not None and shape_image.text:
+                            image_id = shape_image.get(
+                                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
+                            )
+                            if image_id and image_id in doc.part.rels:
+                                append_image_link(image_id, has_drawing, target_buffer)
+                        # Find imagedata element in VML
+                        image_data = shape.find(".//{urn:schemas-microsoft-com:vml}imagedata")
+                        if image_data is not None:
+                            image_id = image_data.get("id") or image_data.get(
+                                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
+                            )
+                            if image_id and image_id in doc.part.rels:
+                                append_image_link(image_id, has_drawing, target_buffer)
                     if run.text.strip():
-                        paragraph_content.append(run.text.strip())
+                        target_buffer.append(run.text.strip())
+
+            def process_hyperlink(hyperlink_elem, target_buffer):
+                # Helper to extract text from a hyperlink element and append it to target_buffer
+                r_id = hyperlink_elem.get(qn("r:id"))
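+                # r:id points at a relationship on the document part; for web links it is external and target_ref holds the URL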
+
+                # Extract text from runs inside the hyperlink
+                link_text_parts = []
+                for run_elem in hyperlink_elem.findall(qn("w:r")):
+                    run = Run(run_elem, paragraph)
+                    # Hyperlink text may be split across multiple runs (e.g., with different formatting),
+                    # so collect all run texts first
+                    if run.text:
+                        link_text_parts.append(run.text)
+
+                link_text = "".join(link_text_parts).strip()
+
+                # Resolve URL
+                if r_id:
+                    try:
+                        rel = doc.part.rels.get(r_id)
+                        if rel and rel.is_external:
+                            link_text = f"[{link_text or rel.target_ref}]({rel.target_ref})"
+                    except Exception:
+                        logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id)
+
+                if link_text:
+                    target_buffer.append(link_text)
+
+            paragraph_content = []
+            # State for legacy HYPERLINK fields
+            hyperlink_field_url = None
+            hyperlink_field_text_parts: list = []
+            is_collecting_field_text = False
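+            # paragraph.runs does not include runs nested inside w:hyperlink, so walk the raw XML children instead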
+            # Iterate through paragraph elements in document order
+            for child in paragraph._element:
+                tag = child.tag
+                if tag == qn("w:r"):
+                    # Regular run
+                    run = Run(child, paragraph)
+
+                    # Check for fldChar (begin/end/separate) and instrText for legacy hyperlinks
+                    fld_chars = child.findall(qn("w:fldChar"))
+                    instr_texts = child.findall(qn("w:instrText"))
+
+                    # Handle fields
+                    if fld_chars or instr_texts:
+                        # Process instrText to find HYPERLINK "url"
+                        for instr in instr_texts:
+                            if instr.text and "HYPERLINK" in instr.text:
+                                # Quick regex to extract URL
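+                                # Typical field instruction text: HYPERLINK "https://example.com"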
+                                match = re.search(r'HYPERLINK\s+"([^"]+)"', instr.text, re.IGNORECASE)
+                                if match:
+                                    hyperlink_field_url = match.group(1)
+
+                        # Process fldChar
+                        for fld_char in fld_chars:
+                            fld_char_type = fld_char.get(qn("w:fldCharType"))
+                            if fld_char_type == "begin":
+                                # Start of a field: reset legacy link state
+                                hyperlink_field_url = None
+                                hyperlink_field_text_parts = []
+                                is_collecting_field_text = False
+                            elif fld_char_type == "separate":
+                                # Separator: if we found a URL, start collecting visible text
+                                if hyperlink_field_url:
+                                    is_collecting_field_text = True
+                            elif fld_char_type == "end":
+                                # End of field
+                                if is_collecting_field_text and hyperlink_field_url:
+                                    # Create markdown link and append to main content
+                                    display_text = "".join(hyperlink_field_text_parts).strip()
+                                    if display_text:
+                                        link_md = f"[{display_text}]({hyperlink_field_url})"
+                                        paragraph_content.append(link_md)
+                                # Reset state
+                                hyperlink_field_url = None
+                                hyperlink_field_text_parts = []
+                                is_collecting_field_text = False
+
+                    # Decide where to append content
+                    target_buffer = hyperlink_field_text_parts if is_collecting_field_text else paragraph_content
+                    process_run(run, target_buffer)
+                elif tag == qn("w:hyperlink"):
+                    process_hyperlink(child, paragraph_content)
             return "".join(paragraph_content) if paragraph_content else ""
 
         paragraphs = doc.paragraphs.copy()