
Commit 85054e7

feat(dify_extractor): support PDF image extraction and fix DOCX hyperlinks (v0.0.8) (#2360)
* fix: correct docx hyperlink extraction
  Iterate paragraph XML to detect w:hyperlink nodes and resolve external r:id relationships into Markdown links.
* feat(rag): implement image extraction in PdfExtractor
  Extract and upload images from PDFs, embedding links in the extracted content.
* chore: Bump dify_extractor plugin version from 0.0.7 to 0.0.8.
1 parent 795016d commit 85054e7

File tree

4 files changed: +238, -49 lines

tools/dify_extractor/manifest.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-version: 0.0.7
+version: 0.0.8
 type: plugin
 author: langgenius
 name: dify_extractor

tools/dify_extractor/tools/dify_extractor.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessag
         if file_extension in {".xlsx", ".xls"}:
             extractor = ExcelExtractor(file_bytes, file_name)
         elif file_extension == ".pdf":
-            extractor = PdfExtractor(file_bytes, file_name)
+            extractor = PdfExtractor(self, file_bytes, file_name)
         elif file_extension in {".md", ".markdown", ".mdx"}:
             extractor = MarkdownExtractor(file_bytes, file_name, tool=self, autodetect_encoding=True)
         elif file_extension in {".htm", ".html"}:
tools/dify_extractor/tools/pdf_extractor.py

Lines changed: 100 additions & 10 deletions
@@ -1,44 +1,134 @@
+import io
+import logging
+import mimetypes
+import uuid
 from collections.abc import Iterator
 from io import BytesIO
-from tools.extractor_base import BaseExtractor
+
+from dify_plugin import Tool
+
 from tools.document import Document, ExtractorResult
+from tools.extractor_base import BaseExtractor
+
+import pypdfium2
+import pypdfium2.raw as pdfium_c
+
+logger = logging.getLogger(__name__)
 
 
 class PdfExtractor(BaseExtractor):
     """Load pdf files.
 
-
     Args:
+        tool: Tool instance
         file_bytes: file bytes
        file_name: file name.
     """
 
-    def __init__(self, file_bytes: bytes, file_name: str):
+    # Magic bytes for image format detection: (magic_bytes, extension, mime_type)
+    IMAGE_FORMATS = [
+        (b"\xff\xd8\xff", "jpg", "image/jpeg"),
+        (b"\x89PNG\r\n\x1a\n", "png", "image/png"),
+        (b"\x00\x00\x00\x0c\x6a\x50\x20\x20\x0d\x0a\x87\x0a", "jp2", "image/jp2"),
+        (b"GIF8", "gif", "image/gif"),
+        (b"BM", "bmp", "image/bmp"),
+        (b"II*\x00", "tiff", "image/tiff"),
+        (b"MM\x00*", "tiff", "image/tiff"),
+        (b"II+\x00", "tiff", "image/tiff"),
+        (b"MM\x00+", "tiff", "image/tiff"),
+    ]
+    MAX_MAGIC_LEN = max(len(m) for m, _, _ in IMAGE_FORMATS)
+
+    def __init__(self, tool: Tool, file_bytes: bytes, file_name: str):
         self._file_bytes = file_bytes
         self._file_name = file_name
+        self._tool = tool
 
     def extract(self) -> ExtractorResult:
-        documents = list(self.parse())
+        documents, img_list = self.parse()
         text_list = []
         for document in documents:
             text_list.append(document.page_content)
         text = "\n\n".join(text_list)
 
-        return ExtractorResult(md_content=text, documents=documents)
-
-    def parse(self) -> Iterator[Document]:
-        """Lazily parse the bytes."""
-        import pypdfium2  # type: ignore
+        return ExtractorResult(md_content=text, documents=documents, img_list=img_list)
 
+    def parse(self) -> tuple[list[Document], list]:
+        """Parse the bytes and return documents and images."""
+        documents = []
+        img_list = []
         with BytesIO(self._file_bytes) as file:
             pdf_reader = pypdfium2.PdfDocument(file, autoclose=True)
             try:
                 for page_number, page in enumerate(pdf_reader):
                     text_page = page.get_textpage()
                     content = text_page.get_text_range()
                     text_page.close()
+
+                    image_content, page_img_list = self._extract_images(page)
+                    if image_content:
+                        content += "\n" + image_content
+                        img_list.extend(page_img_list)
+
                     page.close()
                     metadata = {"source": self._file_name, "page": page_number}
-                    yield Document(page_content=content, metadata=metadata)
+                    documents.append(Document(page_content=content, metadata=metadata))
             finally:
                 pdf_reader.close()
+        return documents, img_list
+
+    def _extract_images(self, page) -> tuple[str, list]:
+        """
+        Extract images from a PDF page, save them to storage,
+        and return markdown image links.
+
+        Args:
+            page: pypdfium2 page object.
+
+        Returns:
+            Markdown string containing links to the extracted images.
+        """
+        image_content = []
+        img_list = []
+
+        try:
+            image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))
+            for obj in image_objects:
+                try:
+                    # Extract image bytes
+                    img_byte_arr = io.BytesIO()
+                    # Extract DCTDecode (JPEG) and JPXDecode (JPEG 2000) images directly
+                    # Fallback to png for other formats
+                    obj.extract(img_byte_arr, fb_format="png")
+                    img_bytes = img_byte_arr.getvalue()
+
+                    if not img_bytes:
+                        continue
+
+                    header = img_bytes[: self.MAX_MAGIC_LEN]
+                    image_ext = None
+                    mime_type = None
+                    for magic, ext, mime in self.IMAGE_FORMATS:
+                        if header.startswith(magic):
+                            image_ext = ext
+                            mime_type = mime
+                            break
+
+                    if not image_ext or not mime_type:
+                        continue
+
+                    file_uuid = str(uuid.uuid4())
+                    file_name = file_uuid + "." + image_ext
+
+                    file_res = self._tool.session.file.upload(
+                        file_name, img_bytes, mime_type
+                    )
+                    image_content.append(f"![image]({file_res.preview_url})")
+                    img_list.append(file_res)
+                except Exception as e:
+                    logger.warning("Failed to extract image from PDF: %s", e)
+                    continue
+        except Exception as e:
+            logger.warning("Failed to get objects from PDF page: %s", e)
+
+        return "\n".join(image_content), img_list

tools/dify_extractor/tools/word_extractor.py

Lines changed: 136 additions & 37 deletions
@@ -6,11 +6,11 @@
 import uuid
 from io import BytesIO
 from urllib.parse import urlparse
-from xml.etree import ElementTree
-
 import requests
 from dify_plugin import Tool
 from docx import Document as DocxDocument
+from docx.oxml.ns import qn
+from docx.text.run import Run
 
 from tools.document import Document, ExtractorResult
 from tools.extractor_base import BaseExtractor
@@ -156,7 +156,7 @@ def _parse_cell_paragraph(self, paragraph, image_map):
             )
             if not image_id:
                 continue
-
+
             if image_id in paragraph.part.rels:
                 rel = paragraph.part.rels[image_id]
                 if rel.is_external:
@@ -194,40 +194,26 @@ def parse_docx(self, file_bytes):
 
         image_map, img_list = self._extract_images_from_docx(doc)
 
-        hyperlinks_url = None
-        url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+")
-        for para in doc.paragraphs:
-            for run in para.runs:
-                if run.text and hyperlinks_url:
-                    result = f" [{run.text}]({hyperlinks_url}) "
-                    run.text = result
-                    hyperlinks_url = None
-                if "HYPERLINK" in run.element.xml:
-                    try:
-                        xml = ElementTree.XML(run.element.xml)
-                        x_child = [c for c in xml.iter() if c is not None]
-                        for x in x_child:
-                            if x_child is None:
-                                continue
-                            if x.tag.endswith("instrText"):
-                                if x.text is None:
-                                    continue
-                                for i in url_pattern.findall(x.text):
-                                    hyperlinks_url = str(i)
-                    except Exception:
-                        logger.exception("Failed to parse HYPERLINK xml")
-
         def parse_paragraph(paragraph):
-            paragraph_content = []
-            for run in paragraph.runs:
-                if (
-                    hasattr(run.element, "tag")
-                    and isinstance(run.element.tag, str)
-                    and run.element.tag.endswith("r")
-                ):
+            def append_image_link(image_id, has_drawing, target_buffer):
+                """Helper to append image link from image_map based on relationship type."""
+                rel = doc.part.rels[image_id]
+                if rel.is_external:
+                    if image_id in image_map and not has_drawing:
+                        target_buffer.append(image_map[image_id])
+                else:
+                    image_part = rel.target_part
+                    if image_part in image_map and not has_drawing:
+                        target_buffer.append(image_map[image_part])
+
+            def process_run(run, target_buffer):
+                # Helper to extract text and embedded images from a run element and append them to target_buffer
+                if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"):
+                    # Process drawing type images
                     drawing_elements = run.element.findall(
                         ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing"
                     )
+                    has_drawing = False
                     for drawing in drawing_elements:
                         blip_elements = drawing.findall(
                             ".//{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
@@ -237,14 +223,127 @@ def parse_paragraph(paragraph):
                                 "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
                             )
                             if embed_id:
-                                if embed_id in image_map:
-                                    paragraph_content.append(image_map[embed_id])
+                                rel = doc.part.rels.get(embed_id)
+                                if rel is not None and rel.is_external:
+                                    # External image: use embed_id as key
+                                    if embed_id in image_map:
+                                        has_drawing = True
+                                        target_buffer.append(image_map[embed_id])
                                 else:
+                                    # Internal image: use target_part as key
                                     image_part = doc.part.related_parts.get(embed_id)
                                     if image_part in image_map:
-                                        paragraph_content.append(image_map[image_part])
+                                        has_drawing = True
+                                        target_buffer.append(image_map[image_part])
+                    # Process pict type images
+                    shape_elements = run.element.findall(
+                        ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pict"
+                    )
+                    for shape in shape_elements:
+                        # Find image data in VML
+                        shape_image = shape.find(
+                            ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}binData"
+                        )
+                        if shape_image is not None and shape_image.text:
+                            image_id = shape_image.get(
+                                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
+                            )
+                            if image_id and image_id in doc.part.rels:
+                                append_image_link(image_id, has_drawing, target_buffer)
+                        # Find imagedata element in VML
+                        image_data = shape.find(".//{urn:schemas-microsoft-com:vml}imagedata")
+                        if image_data is not None:
+                            image_id = image_data.get("id") or image_data.get(
+                                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
+                            )
+                            if image_id and image_id in doc.part.rels:
+                                append_image_link(image_id, has_drawing, target_buffer)
                 if run.text.strip():
-                    paragraph_content.append(run.text.strip())
+                    target_buffer.append(run.text.strip())
+
+            def process_hyperlink(hyperlink_elem, target_buffer):
+                # Helper to extract text from a hyperlink element and append it to target_buffer
+                r_id = hyperlink_elem.get(qn("r:id"))
+
+                # Extract text from runs inside the hyperlink
+                link_text_parts = []
+                for run_elem in hyperlink_elem.findall(qn("w:r")):
+                    run = Run(run_elem, paragraph)
+                    # Hyperlink text may be split across multiple runs (e.g., with different formatting),
+                    # so collect all run texts first
+                    if run.text:
+                        link_text_parts.append(run.text)
+
+                link_text = "".join(link_text_parts).strip()
+
+                # Resolve URL
+                if r_id:
+                    try:
+                        rel = doc.part.rels.get(r_id)
+                        if rel and rel.is_external:
+                            link_text = f"[{link_text or rel.target_ref}]({rel.target_ref})"
+                    except Exception:
+                        logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id)
+
+                if link_text:
+                    target_buffer.append(link_text)
+
+            paragraph_content = []
+            # State for legacy HYPERLINK fields
+            hyperlink_field_url = None
+            hyperlink_field_text_parts: list = []
+            is_collecting_field_text = False
+            # Iterate through paragraph elements in document order
+            for child in paragraph._element:
+                tag = child.tag
+                if tag == qn("w:r"):
+                    # Regular run
+                    run = Run(child, paragraph)
+
+                    # Check for fldChar (begin/end/separate) and instrText for legacy hyperlinks
+                    fld_chars = child.findall(qn("w:fldChar"))
+                    instr_texts = child.findall(qn("w:instrText"))
+
+                    # Handle Fields
+                    if fld_chars or instr_texts:
+                        # Process instrText to find HYPERLINK "url"
+                        for instr in instr_texts:
+                            if instr.text and "HYPERLINK" in instr.text:
+                                # Quick regex to extract URL
+                                match = re.search(r'HYPERLINK\s+"([^"]+)"', instr.text, re.IGNORECASE)
+                                if match:
+                                    hyperlink_field_url = match.group(1)
+
+                        # Process fldChar
+                        for fld_char in fld_chars:
+                            fld_char_type = fld_char.get(qn("w:fldCharType"))
+                            if fld_char_type == "begin":
+                                # Start of a field: reset legacy link state
+                                hyperlink_field_url = None
+                                hyperlink_field_text_parts = []
+                                is_collecting_field_text = False
+                            elif fld_char_type == "separate":
+                                # Separator: if we found a URL, start collecting visible text
+                                if hyperlink_field_url:
+                                    is_collecting_field_text = True
+                            elif fld_char_type == "end":
+                                # End of field
+                                if is_collecting_field_text and hyperlink_field_url:
+                                    # Create markdown link and append to main content
+                                    display_text = "".join(hyperlink_field_text_parts).strip()
+                                    if display_text:
+                                        link_md = f"[{display_text}]({hyperlink_field_url})"
+                                        paragraph_content.append(link_md)
+                                # Reset state
+                                hyperlink_field_url = None
+                                hyperlink_field_text_parts = []
+                                is_collecting_field_text = False
+
+                    # Decide where to append content
+                    target_buffer = hyperlink_field_text_parts if is_collecting_field_text else paragraph_content
+                    process_run(run, target_buffer)
+                elif tag == qn("w:hyperlink"):
+                    process_hyperlink(child, paragraph_content)
             return "".join(paragraph_content) if paragraph_content else ""
 
         paragraphs = doc.paragraphs.copy()
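To see the w:hyperlink handling in isolation, here is a minimal sketch, assuming only python-docx, that renders every external hyperlink in a document as a Markdown link by the same steps as process_hyperlink above: find w:hyperlink children of the paragraph XML, join the text of their w:r runs, and resolve the r:id through the part's relationships. markdown_links and demo.docx are hypothetical names, and, like the committed code, the sketch reaches into the private paragraph._element.

from docx import Document as DocxDocument
from docx.oxml.ns import qn
from docx.text.run import Run

def markdown_links(docx_path: str) -> list[str]:
    doc = DocxDocument(docx_path)
    links = []
    for paragraph in doc.paragraphs:
        # w:hyperlink elements sit alongside w:r in the paragraph XML,
        # which is why iterating paragraph.runs alone misses them
        for hyperlink in paragraph._element.findall(qn("w:hyperlink")):
            r_id = hyperlink.get(qn("r:id"))
            text = "".join(
                Run(r, paragraph).text or "" for r in hyperlink.findall(qn("w:r"))
            ).strip()
            rel = doc.part.rels.get(r_id) if r_id else None
            if rel is not None and rel.is_external:
                # Fall back to the target URL when the link has no visible text
                links.append(f"[{text or rel.target_ref}]({rel.target_ref})")
    return links

# Usage: print(markdown_links("demo.docx"))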
