
Commit 85054e7

feat(dify_extractor): support PDF image extraction and fix DOCX hyperlinks (v0.0.8) (#2360)
* fix: correct docx hyperlink extraction
  Iterate paragraph XML to detect w:hyperlink nodes and resolve external r:id relationships into Markdown links.
* feat(rag): implement image extraction in PdfExtractor
  Extract and upload images from PDFs, embedding links in the extracted content.
* chore: Bump dify_extractor plugin version from 0.0.7 to 0.0.8.
1 parent 795016d commit 85054e7

File tree

4 files changed: +238, -49 lines

tools/dify_extractor/manifest.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-version: 0.0.7
+version: 0.0.8
 type: plugin
 author: langgenius
 name: dify_extractor

tools/dify_extractor/tools/dify_extractor.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessag
         if file_extension in {".xlsx", ".xls"}:
             extractor = ExcelExtractor(file_bytes, file_name)
         elif file_extension == ".pdf":
-            extractor = PdfExtractor(file_bytes, file_name)
+            extractor = PdfExtractor(self, file_bytes, file_name)
         elif file_extension in {".md", ".markdown", ".mdx"}:
             extractor = MarkdownExtractor(file_bytes, file_name, tool=self, autodetect_encoding=True)
         elif file_extension in {".htm", ".html"}:
tools/dify_extractor/tools/pdf_extractor.py

Lines changed: 100 additions & 10 deletions
@@ -1,44 +1,134 @@
+import io
+import logging
+import mimetypes
+import uuid
 from collections.abc import Iterator
 from io import BytesIO
-from tools.extractor_base import BaseExtractor
+
+from dify_plugin import Tool
+
 from tools.document import Document, ExtractorResult
+from tools.extractor_base import BaseExtractor
+
+import pypdfium2
+import pypdfium2.raw as pdfium_c
+
+logger = logging.getLogger(__name__)
 
 
 class PdfExtractor(BaseExtractor):
     """Load pdf files.
 
-
     Args:
+        tool: Tool instance
         file_bytes: file bytes
        file_name: file name.
     """
 
-    def __init__(self, file_bytes: bytes, file_name: str):
+    # Magic bytes for image format detection: (magic_bytes, extension, mime_type)
+    IMAGE_FORMATS = [
+        (b"\xff\xd8\xff", "jpg", "image/jpeg"),
+        (b"\x89PNG\r\n\x1a\n", "png", "image/png"),
+        (b"\x00\x00\x00\x0c\x6a\x50\x20\x20\x0d\x0a\x87\x0a", "jp2", "image/jp2"),
+        (b"GIF8", "gif", "image/gif"),
+        (b"BM", "bmp", "image/bmp"),
+        (b"II*\x00", "tiff", "image/tiff"),
+        (b"MM\x00*", "tiff", "image/tiff"),
+        (b"II+\x00", "tiff", "image/tiff"),
+        (b"MM\x00+", "tiff", "image/tiff"),
+    ]
+    MAX_MAGIC_LEN = max(len(m) for m, _, _ in IMAGE_FORMATS)
+
+    def __init__(self, tool: Tool, file_bytes: bytes, file_name: str):
         self._file_bytes = file_bytes
         self._file_name = file_name
+        self._tool = tool
 
     def extract(self) -> ExtractorResult:
-        documents = list(self.parse())
+        documents, img_list = self.parse()
         text_list = []
         for document in documents:
             text_list.append(document.page_content)
         text = "\n\n".join(text_list)
 
-        return ExtractorResult(md_content=text, documents=documents)
-
-    def parse(self) -> Iterator[Document]:
-        """Lazily parse the bytes."""
-        import pypdfium2  # type: ignore
+        return ExtractorResult(md_content=text, documents=documents, img_list=img_list)
 
+    def parse(self) -> tuple[list[Document], list]:
+        """Parse the bytes and return documents and images."""
+        documents = []
+        img_list = []
         with BytesIO(self._file_bytes) as file:
             pdf_reader = pypdfium2.PdfDocument(file, autoclose=True)
             try:
                 for page_number, page in enumerate(pdf_reader):
                     text_page = page.get_textpage()
                     content = text_page.get_text_range()
                     text_page.close()
+
+                    image_content, page_img_list = self._extract_images(page)
+                    if image_content:
+                        content += "\n" + image_content
+                        img_list.extend(page_img_list)
+
                     page.close()
                     metadata = {"source": self._file_name, "page": page_number}
-                    yield Document(page_content=content, metadata=metadata)
+                    documents.append(Document(page_content=content, metadata=metadata))
             finally:
                 pdf_reader.close()
+        return documents, img_list
+
+    def _extract_images(self, page) -> tuple[str, list]:
+        """
+        Extract images from a PDF page, save them to storage,
+        and return markdown image links.
+
+        Args:
+            page: pypdfium2 page object.
+
+        Returns:
+            Markdown string containing links to the extracted images.
+        """
+        image_content = []
+        img_list = []
+
+        try:
+            image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))
+            for obj in image_objects:
+                try:
+                    # Extract image bytes
+                    img_byte_arr = io.BytesIO()
+                    # Extract DCTDecode (JPEG) and JPXDecode (JPEG 2000) images directly
+                    # Fallback to png for other formats
+                    obj.extract(img_byte_arr, fb_format="png")
+                    img_bytes = img_byte_arr.getvalue()
+
+                    if not img_bytes:
+                        continue
+
+                    header = img_bytes[: self.MAX_MAGIC_LEN]
+                    image_ext = None
+                    mime_type = None
+                    for magic, ext, mime in self.IMAGE_FORMATS:
+                        if header.startswith(magic):
+                            image_ext = ext
+                            mime_type = mime
+                            break
+
+                    if not image_ext or not mime_type:
+                        continue
+
+                    file_uuid = str(uuid.uuid4())
+                    file_name = file_uuid + "." + image_ext
+
+                    file_res = self._tool.session.file.upload(
+                        file_name, img_bytes, mime_type
+                    )
+                    image_content.append(f"![image]({file_res.preview_url})")
+                    img_list.append(file_res)
+                except Exception as e:
+                    logger.warning("Failed to extract image from PDF: %s", e)
+                    continue
+        except Exception as e:
+            logger.warning("Failed to get objects from PDF page: %s", e)
+
+        return "\n".join(image_content), img_list

tools/dify_extractor/tools/word_extractor.py

Lines changed: 136 additions & 37 deletions
@@ -6,11 +6,11 @@
 import uuid
 from io import BytesIO
 from urllib.parse import urlparse
-from xml.etree import ElementTree
-
 import requests
 from dify_plugin import Tool
 from docx import Document as DocxDocument
+from docx.oxml.ns import qn
+from docx.text.run import Run
 
 from tools.document import Document, ExtractorResult
 from tools.extractor_base import BaseExtractor
@@ -156,7 +156,7 @@ def _parse_cell_paragraph(self, paragraph, image_map):
             )
             if not image_id:
                 continue
-
+
             if image_id in paragraph.part.rels:
                 rel = paragraph.part.rels[image_id]
                 if rel.is_external:
@@ -194,40 +194,26 @@ def parse_docx(self, file_bytes):
 
         image_map, img_list = self._extract_images_from_docx(doc)
 
-        hyperlinks_url = None
-        url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+")
-        for para in doc.paragraphs:
-            for run in para.runs:
-                if run.text and hyperlinks_url:
-                    result = f" [{run.text}]({hyperlinks_url}) "
-                    run.text = result
-                    hyperlinks_url = None
-                if "HYPERLINK" in run.element.xml:
-                    try:
-                        xml = ElementTree.XML(run.element.xml)
-                        x_child = [c for c in xml.iter() if c is not None]
-                        for x in x_child:
-                            if x_child is None:
-                                continue
-                            if x.tag.endswith("instrText"):
-                                if x.text is None:
-                                    continue
-                                for i in url_pattern.findall(x.text):
-                                    hyperlinks_url = str(i)
-                    except Exception:
-                        logger.exception("Failed to parse HYPERLINK xml")
-
         def parse_paragraph(paragraph):
-            paragraph_content = []
-            for run in paragraph.runs:
-                if (
-                    hasattr(run.element, "tag")
-                    and isinstance(run.element.tag, str)
-                    and run.element.tag.endswith("r")
-                ):
+            def append_image_link(image_id, has_drawing, target_buffer):
+                """Helper to append image link from image_map based on relationship type."""
+                rel = doc.part.rels[image_id]
+                if rel.is_external:
+                    if image_id in image_map and not has_drawing:
+                        target_buffer.append(image_map[image_id])
+                else:
+                    image_part = rel.target_part
+                    if image_part in image_map and not has_drawing:
+                        target_buffer.append(image_map[image_part])
+
+            def process_run(run, target_buffer):
+                # Helper to extract text and embedded images from a run element and append them to target_buffer
+                if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"):
+                    # Process drawing type images
                     drawing_elements = run.element.findall(
                         ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing"
                     )
+                    has_drawing = False
                     for drawing in drawing_elements:
                         blip_elements = drawing.findall(
                             ".//{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
@@ -237,14 +223,127 @@ def parse_paragraph(paragraph):
                                 "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
                             )
                             if embed_id:
-                                if embed_id in image_map:
-                                    paragraph_content.append(image_map[embed_id])
+                                rel = doc.part.rels.get(embed_id)
+                                if rel is not None and rel.is_external:
+                                    # External image: use embed_id as key
+                                    if embed_id in image_map:
+                                        has_drawing = True
+                                        target_buffer.append(image_map[embed_id])
                                 else:
+                                    # Internal image: use target_part as key
                                     image_part = doc.part.related_parts.get(embed_id)
                                     if image_part in image_map:
-                                        paragraph_content.append(image_map[image_part])
+                                        has_drawing = True
+                                        target_buffer.append(image_map[image_part])
+                    # Process pict type images
+                    shape_elements = run.element.findall(
+                        ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pict"
+                    )
+                    for shape in shape_elements:
+                        # Find image data in VML
+                        shape_image = shape.find(
+                            ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}binData"
+                        )
+                        if shape_image is not None and shape_image.text:
+                            image_id = shape_image.get(
+                                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
+                            )
+                            if image_id and image_id in doc.part.rels:
+                                append_image_link(image_id, has_drawing, target_buffer)
+                        # Find imagedata element in VML
+                        image_data = shape.find(".//{urn:schemas-microsoft-com:vml}imagedata")
+                        if image_data is not None:
+                            image_id = image_data.get("id") or image_data.get(
+                                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
+                            )
+                            if image_id and image_id in doc.part.rels:
+                                append_image_link(image_id, has_drawing, target_buffer)
                 if run.text.strip():
-                    paragraph_content.append(run.text.strip())
+                    target_buffer.append(run.text.strip())
+
+            def process_hyperlink(hyperlink_elem, target_buffer):
+                # Helper to extract text from a hyperlink element and append it to target_buffer
+                r_id = hyperlink_elem.get(qn("r:id"))
+
+                # Extract text from runs inside the hyperlink
+                link_text_parts = []
+                for run_elem in hyperlink_elem.findall(qn("w:r")):
+                    run = Run(run_elem, paragraph)
+                    # Hyperlink text may be split across multiple runs (e.g., with different formatting),
+                    # so collect all run texts first
+                    if run.text:
+                        link_text_parts.append(run.text)
+
+                link_text = "".join(link_text_parts).strip()
+
+                # Resolve URL
+                if r_id:
+                    try:
+                        rel = doc.part.rels.get(r_id)
+                        if rel and rel.is_external:
+                            link_text = f"[{link_text or rel.target_ref}]({rel.target_ref})"
+                    except Exception:
+                        logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id)
+
+                if link_text:
+                    target_buffer.append(link_text)
+
+            paragraph_content = []
+            # State for legacy HYPERLINK fields
+            hyperlink_field_url = None
+            hyperlink_field_text_parts: list = []
+            is_collecting_field_text = False
+            # Iterate through paragraph elements in document order
+            for child in paragraph._element:
+                tag = child.tag
+                if tag == qn("w:r"):
+                    # Regular run
+                    run = Run(child, paragraph)
+
+                    # Check for fldChar (begin/end/separate) and instrText for legacy hyperlinks
+                    fld_chars = child.findall(qn("w:fldChar"))
+                    instr_texts = child.findall(qn("w:instrText"))
+
+                    # Handle Fields
+                    if fld_chars or instr_texts:
+                        # Process instrText to find HYPERLINK "url"
+                        for instr in instr_texts:
+                            if instr.text and "HYPERLINK" in instr.text:
+                                # Quick regex to extract URL
+                                match = re.search(r'HYPERLINK\s+"([^"]+)"', instr.text, re.IGNORECASE)
+                                if match:
+                                    hyperlink_field_url = match.group(1)
+
+                        # Process fldChar
+                        for fld_char in fld_chars:
+                            fld_char_type = fld_char.get(qn("w:fldCharType"))
+                            if fld_char_type == "begin":
+                                # Start of a field: reset legacy link state
+                                hyperlink_field_url = None
+                                hyperlink_field_text_parts = []
+                                is_collecting_field_text = False
+                            elif fld_char_type == "separate":
+                                # Separator: if we found a URL, start collecting visible text
+                                if hyperlink_field_url:
+                                    is_collecting_field_text = True
+                            elif fld_char_type == "end":
+                                # End of field
+                                if is_collecting_field_text and hyperlink_field_url:
+                                    # Create markdown link and append to main content
+                                    display_text = "".join(hyperlink_field_text_parts).strip()
+                                    if display_text:
+                                        link_md = f"[{display_text}]({hyperlink_field_url})"
+                                        paragraph_content.append(link_md)
+                                # Reset state
+                                hyperlink_field_url = None
+                                hyperlink_field_text_parts = []
+                                is_collecting_field_text = False
+
+                    # Decide where to append content
+                    target_buffer = hyperlink_field_text_parts if is_collecting_field_text else paragraph_content
+                    process_run(run, target_buffer)
+                elif tag == qn("w:hyperlink"):
+                    process_hyperlink(child, paragraph_content)
             return "".join(paragraph_content) if paragraph_content else ""
 
         paragraphs = doc.paragraphs.copy()
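To see the w:hyperlink handling in isolation, here is a minimal sketch, assuming only python-docx, that renders every external hyperlink in a document as a Markdown link by the same steps as process_hyperlink above: find w:hyperlink children of the paragraph XML, join the text of their w:r runs, and resolve the r:id through the part's relationships. markdown_links and demo.docx are hypothetical names, and, like the committed code, the sketch reaches into the private paragraph._element.

from docx import Document as DocxDocument
from docx.oxml.ns import qn
from docx.text.run import Run

def markdown_links(docx_path: str) -> list[str]:
    doc = DocxDocument(docx_path)
    links = []
    for paragraph in doc.paragraphs:
        # w:hyperlink elements sit alongside w:r in the paragraph XML,
        # which is why iterating paragraph.runs alone misses them
        for hyperlink in paragraph._element.findall(qn("w:hyperlink")):
            r_id = hyperlink.get(qn("r:id"))
            text = "".join(
                Run(r, paragraph).text or "" for r in hyperlink.findall(qn("w:r"))
            ).strip()
            rel = doc.part.rels.get(r_id) if r_id else None
            if rel is not None and rel.is_external:
                # Fall back to the target URL when the link has no visible text
                links.append(f"[{text or rel.target_ref}]({rel.target_ref})")
    return links

# Usage: print(markdown_links("demo.docx"))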
