Skip to content

Commit 1682993

Browse files
rateixei and dolfim-ibm authored
feat(docx): Process drawingml objects in docx (#2453)
* Export of DrawingML figures into docling document * Adding libreoffice env var and libreoffice to checks image Signed-off-by: Rafael Teixeira de Lima <[email protected]> * DCO Remediation Commit for Rafael Teixeira de Lima <[email protected]> I, Rafael Teixeira de Lima <[email protected]>, hereby add my Signed-off-by to this commit: 9518fff Signed-off-by: Rafael Teixeira de Lima <[email protected]> * Enforcing apt get update Signed-off-by: Rafael Teixeira de Lima <[email protected]> * Only display drawingml warning once per document Signed-off-by: Rafael Teixeira de Lima <[email protected]> * add util to test libreoffice and exclude files from test when not found Signed-off-by: Michele Dolfi <[email protected]> * check libreoffice only once Signed-off-by: Michele Dolfi <[email protected]> * Only initialise converter if needed Signed-off-by: Rafael Teixeira de Lima <[email protected]> --------- Signed-off-by: Rafael Teixeira de Lima <[email protected]> Signed-off-by: Michele Dolfi <[email protected]> Co-authored-by: Michele Dolfi <[email protected]>
1 parent 3e6da2c commit 1682993

File tree

8 files changed

+512
-25
lines changed

8 files changed

+512
-25
lines changed

.github/workflows/checks.yml

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,8 @@ jobs:
8080
8181
- name: Install System Dependencies
8282
run: |
83-
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
84-
sudo apt-get -qq update
85-
fi
86-
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
83+
sudo apt-get -qq update
84+
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
8785
8886
- name: Set TESSDATA_PREFIX
8987
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
@@ -149,10 +147,8 @@ jobs:
149147
150148
- name: Install System Dependencies
151149
run: |
152-
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
153-
sudo apt-get -qq update
154-
fi
155-
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
150+
sudo apt-get -qq update
151+
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
156152
157153
- name: Set TESSDATA_PREFIX
158154
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
@@ -223,10 +219,8 @@ jobs:
223219
224220
- name: Install System Dependencies
225221
run: |
226-
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
227-
sudo apt-get -qq update
228-
fi
229-
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
222+
sudo apt-get -qq update
223+
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
230224
231225
- name: Set TESSDATA_PREFIX
232226
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
docling/backend/docx/drawingml/utils.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import os
2+
import shutil
3+
import subprocess
4+
from pathlib import Path
5+
from tempfile import mkdtemp
6+
from typing import Callable, Optional
7+
8+
import pypdfium2
9+
from docx.document import Document
10+
from PIL import Image, ImageChops
11+
12+
13+
def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
    """Locate the LibreOffice executable and optionally verify that it runs.

    Candidates are tried in this order and the first hit wins:

    1. the ``DOCLING_LIBREOFFICE_CMD`` environment variable (a command name
       or a path), when it is set and resolves to an executable;
    2. ``libreoffice`` on ``PATH``;
    3. ``soffice`` on ``PATH``;
    4. the default macOS application bundle location.

    Args:
        raise_if_unavailable: When true, raise instead of returning ``None``
            if no candidate is found, and smoke-test the command by running
            it with ``-h``.

    Returns:
        The command to invoke, or ``None`` when nothing was found and
        ``raise_if_unavailable`` is false.

    Raises:
        RuntimeError: No candidate was found and ``raise_if_unavailable`` is
            true.
        subprocess.CalledProcessError: The smoke test exited with a non-zero
            status.
        OSError: The smoke test could not start the command.
    """
    macos_bundle_cmd = "/Applications/LibreOffice.app/Contents/MacOS/soffice"

    # Explicit user override. It is resolved through shutil.which so that both
    # bare command names and full paths are accepted; a value that does not
    # resolve to an executable is ignored and auto-detection takes over.
    env_cmd = os.environ.get("DOCLING_LIBREOFFICE_CMD")

    libreoffice_cmd = (
        (shutil.which(env_cmd) if env_cmd else None)
        or shutil.which("libreoffice")
        or shutil.which("soffice")
        or (macos_bundle_cmd if os.path.isfile(macos_bundle_cmd) else None)
    )

    if raise_if_unavailable:
        if libreoffice_cmd is None:
            raise RuntimeError("Libreoffice not found")

        # The following test will raise if the libreoffice_cmd cannot be used
        subprocess.run(
            [
                libreoffice_cmd,
                "-h",
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )

    return libreoffice_cmd
42+
43+
44+
def get_docx_to_pdf_converter() -> Optional[Callable]:
    """Return a DOCX-to-PDF conversion function for the best available tool.

    Only LibreOffice is currently probed (via ``get_libreoffice_cmd``).

    Returns:
        A callable ``convert(input_path, output_path)`` that writes the PDF
        rendering of ``input_path`` to ``output_path``, or ``None`` if no
        conversion tool is available. Both arguments may be ``str`` or
        path-like objects.
    """

    # Try LibreOffice
    libreoffice_cmd = get_libreoffice_cmd()

    if libreoffice_cmd:

        def convert_with_libreoffice(input_path, output_path):
            """Convert ``input_path`` to PDF at ``output_path`` with LibreOffice.

            Raises:
                subprocess.CalledProcessError: LibreOffice exited non-zero.
                OSError: LibreOffice could not be started, or the PDF it was
                    expected to produce is missing.
            """
            # Normalise to plain strings: callers pass pathlib.Path objects,
            # and a str/Path comparison below would never be equal.
            input_path = os.fspath(input_path)
            output_path = os.fspath(output_path)
            output_dir = os.path.dirname(output_path)

            subprocess.run(
                [
                    libreoffice_cmd,
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    # A bare file name has no directory part; pass the current
                    # directory instead of an empty argument.
                    output_dir or os.curdir,
                    input_path,
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )

            # LibreOffice picks the output name itself: the input file's stem
            # with a ".pdf" suffix, placed inside --outdir.
            expected_output = os.path.join(
                output_dir,
                os.path.splitext(os.path.basename(input_path))[0] + ".pdf",
            )
            if expected_output != output_path:
                # os.replace overwrites an existing target on every platform.
                os.replace(expected_output, output_path)

        return convert_with_libreoffice

    ## Space for other DOCX to PDF converters if available

    # No tools found
    return None
85+
86+
87+
def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
    """Trim the uniform background border surrounding the content of ``image``.

    Args:
        image: Picture to crop.
        bg_color: Colour treated as background. When ``None``, the colour of
            the top-left pixel is used.
        padding: Number of pixels of background to keep around the detected
            content, clamped to the image bounds.

    Returns:
        The cropped image, or ``image`` itself when no bounding box of
        differing pixels is found.
    """
    fill = image.getpixel((0, 0)) if bg_color is None else bg_color

    # Compare against a solid canvas of the background colour; the bounding
    # box of the difference is the region holding actual content.
    background = Image.new(image.mode, image.size, fill)
    content_box = ImageChops.difference(image, background).getbbox()

    if not content_box:
        return image

    x0, y0, x1, y1 = content_box
    return image.crop(
        (
            max(0, x0 - padding),
            max(0, y0 - padding),
            min(image.width, x1 + padding),
            min(image.height, y1 + padding),
        )
    )
104+
105+
106+
def get_pil_from_dml_docx(
    docx: Document, converter: Optional[Callable]
) -> Optional[Image.Image]:
    """Render a DOCX document to a cropped PIL image of its first page.

    The document is saved into a fresh temporary directory, converted to PDF
    with ``converter``, and the first PDF page is rendered at scale 2 and
    passed through ``crop_whitespace``. The temporary directory and the PDF
    handles are released even when one of these steps raises.

    Args:
        docx: Document to render; it must provide ``save(path)``.
        converter: Callable ``(input_path, output_path)`` performing the
            DOCX-to-PDF conversion, or ``None`` when no converter exists.

    Returns:
        The rendered image, or ``None`` when ``converter`` is ``None``.

    Raises:
        Whatever ``docx.save``, ``converter`` or the PDF rendering raise;
        errors are propagated after cleanup.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = temp_dir / "drawing_only.docx"
        temp_pdf = temp_dir / "drawing_only.pdf"

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Load PDF as PNG
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            page = pdf[0]
            try:
                image = crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            # Close the document before the directory is removed below.
            pdf.close()
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)

    return image

docling/backend/msword_backend.py

Lines changed: 87 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import logging
22
import re
3+
from copy import deepcopy
34
from io import BytesIO
45
from pathlib import Path
5-
from typing import Any, List, Optional, Union
6+
from typing import Any, Callable, List, Optional, Union
67

78
from docling_core.types.doc import (
89
DocItemLabel,
@@ -33,6 +34,11 @@
3334
from typing_extensions import override
3435

3536
from docling.backend.abstract_backend import DeclarativeDocumentBackend
37+
from docling.backend.docx.drawingml.utils import (
38+
get_docx_to_pdf_converter,
39+
get_libreoffice_cmd,
40+
get_pil_from_dml_docx,
41+
)
3642
from docling.backend.docx.latex.omml import oMath2Latex
3743
from docling.datamodel.base_models import InputFormat
3844
from docling.datamodel.document import InputDocument
@@ -64,6 +70,9 @@ def __init__(
6470
self.equation_bookends: str = "<eq>{EQ}</eq>"
6571
# Track processed textbox elements to avoid duplication
6672
self.processed_textbox_elements: List[int] = []
73+
self.docx_to_pdf_converter: Optional[Callable] = None
74+
self.docx_to_pdf_converter_init = False
75+
self.display_drawingml_warning = True
6776

6877
for i in range(-1, self.max_levels):
6978
self.parents[i] = None
@@ -80,18 +89,11 @@ def __init__(
8089
"indents": [None],
8190
}
8291

83-
self.docx_obj = None
84-
try:
85-
if isinstance(self.path_or_stream, BytesIO):
86-
self.docx_obj = Document(self.path_or_stream)
87-
elif isinstance(self.path_or_stream, Path):
88-
self.docx_obj = Document(str(self.path_or_stream))
89-
92+
self.docx_obj = self.load_msword_file(
93+
path_or_stream=self.path_or_stream, document_hash=self.document_hash
94+
)
95+
if self.docx_obj:
9096
self.valid = True
91-
except Exception as e:
92-
raise RuntimeError(
93-
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
94-
) from e
9597

9698
@override
9799
def is_valid(self) -> bool:
@@ -139,6 +141,22 @@ def convert(self) -> DoclingDocument:
139141
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
140142
)
141143

144+
@staticmethod
def load_msword_file(
    path_or_stream: Union[BytesIO, Path], document_hash: str
) -> Optional[DocxDocument]:
    """Open a Word document from an in-memory stream or a filesystem path.

    Args:
        path_or_stream: ``BytesIO`` holding the document bytes, or a ``Path``
            pointing at the file.
        document_hash: Identifier of the document, used only in the error
            message.

    Returns:
        The loaded document, or ``None`` when ``path_or_stream`` is neither a
        ``BytesIO`` nor a ``Path``.

    Raises:
        RuntimeError: The document could not be loaded; the original
            exception is attached as the cause.
    """
    try:
        if isinstance(path_or_stream, BytesIO):
            return Document(path_or_stream)
        elif isinstance(path_or_stream, Path):
            return Document(str(path_or_stream))
        else:
            # Unsupported input type: report "nothing loaded" to the caller.
            return None
    except Exception as e:
        raise RuntimeError(
            f"MsWordDocumentBackend could not load document with hash {document_hash}"
        ) from e
159+
142160
def _update_history(
143161
self,
144162
name: str,
@@ -195,6 +213,7 @@ def _walk_linear(
195213
}
196214
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
197215
drawing_blip = xpath_expr(element)
216+
drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
198217

199218
# Check for textbox content - check multiple textbox formats
200219
# Only process if the element hasn't been processed before
@@ -274,6 +293,26 @@ def _walk_linear(
274293
):
275294
te1 = self._handle_text_elements(element, docx_obj, doc)
276295
added_elements.extend(te1)
296+
# Check for DrawingML elements
297+
elif drawingml_els:
298+
if (
299+
self.docx_to_pdf_converter is None
300+
and self.docx_to_pdf_converter_init is False
301+
):
302+
self.docx_to_pdf_converter = get_docx_to_pdf_converter()
303+
self.docx_to_pdf_converter_init = True
304+
305+
if self.docx_to_pdf_converter is None:
306+
if self.display_drawingml_warning:
307+
if self.docx_to_pdf_converter is None:
308+
_log.warning(
309+
"Found DrawingML elements in document, but no DOCX to PDF converters. "
310+
"If you want these exported, make sure you have "
311+
"LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
312+
)
313+
self.display_drawingml_warning = False
314+
else:
315+
self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
277316
# Check for the sdt containers, like table of contents
278317
elif tag_name in ["sdt"]:
279318
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -1381,3 +1420,39 @@ def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
13811420
)
13821421
elem_ref.append(p3.get_ref())
13831422
return elem_ref
1423+
1424+
def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
    """Render DrawingML elements to an image and add it to ``doc`` as a picture.

    The source document is loaded a second time and its body emptied, then
    copies of the drawing elements are inserted into it. That document is
    exported DOCX -> PDF -> image through ``self.docx_to_pdf_converter``.
    If rendering fails, a picture without image data is added instead so the
    drawing still appears in the output.

    Args:
        doc: Document receiving the picture item.
        drawingml_els: Iterable of ``w:drawing`` XML elements to render.
    """
    # Local import: needed only to recognise failures of the external converter.
    import subprocess

    # 1) Make an empty copy of the original document
    dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
    if dml_doc is not None:
        body = dml_doc._element.body
        for child in list(body):
            body.remove(child)

        # 2) Add DrawingML to empty document
        new_run = dml_doc.add_paragraph().add_run()
        for dml in drawingml_els:
            # Copy, so the element stays attached to the original document.
            new_run._r.append(deepcopy(dml))

    # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
    level = self._get_level()
    parent = self.parents[level - 1]
    try:
        pil_image = (
            get_pil_from_dml_docx(dml_doc, converter=self.docx_to_pdf_converter)
            if dml_doc is not None
            else None
        )
        if pil_image is None:
            raise UnidentifiedImageError

        doc.add_picture(
            parent=parent,
            image=ImageRef.from_pil(image=pil_image, dpi=72),
            caption=None,
        )
    except (UnidentifiedImageError, OSError, subprocess.SubprocessError):
        # SubprocessError covers a failing converter process; without it one
        # unrenderable drawing would abort conversion of the whole document.
        _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
        doc.add_picture(
            parent=parent,
            caption=None,
        )

    return

tests/data/docx/drawingml.docx

40.3 KB
Binary file not shown.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
item-0 at level 0: unspecified: group _root_
2+
item-1 at level 1: section: group textbox
3+
item-2 at level 2: text: Text 2
4+
item-3 at level 2: text: Text 1
5+
item-4 at level 1: picture
6+
item-5 at level 1: text:
7+
item-6 at level 1: text:
8+
item-7 at level 1: text:
9+
item-8 at level 1: text:
10+
item-9 at level 1: text:
11+
item-10 at level 1: text:
12+
item-11 at level 1: text:
13+
item-12 at level 1: picture

0 commit comments

Comments
 (0)