docling-project
diff --git a/‎.github/workflows/checks.yml‎
Lines changed: 6 additions & 12 deletions b/‎.github/workflows/checks.yml‎
Lines changed: 6 additions & 12 deletions
diff --git a/‎docling/backend/docx/drawingml/utils.py‎
Lines changed: 131 additions & 0 deletions b/‎docling/backend/docx/drawingml/utils.py‎
Lines changed: 131 additions & 0 deletions
diff --git a/‎docling/backend/msword_backend.py‎
Lines changed: 87 additions & 12 deletions b/‎docling/backend/msword_backend.py‎
Lines changed: 87 additions & 12 deletions
diff --git a/‎tests/data/docx/drawingml.docx‎
40.3 KB b/‎tests/data/docx/drawingml.docx‎
40.3 KB
diff --git a/‎tests/data/groundtruth/docling_v2/drawingml.docx.itxt‎
Lines changed: 13 additions & 0 deletions b/‎tests/data/groundtruth/docling_v2/drawingml.docx.itxt‎
Lines changed: 13 additions & 0 deletions
@@ -80,10 +80,8 @@ jobs:
 
         - name: Install System Dependencies
           run: |
-            if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
-              sudo apt-get -qq update
-            fi
-            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
+            sudo apt-get -qq update
+            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
 
         - name: Set TESSDATA_PREFIX
           run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
@@ -149,10 +147,8 @@ jobs:
 
         - name: Install System Dependencies
           run: |
-            if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
-              sudo apt-get -qq update
-            fi
-            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
+            sudo apt-get -qq update
+            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
 
         - name: Set TESSDATA_PREFIX
           run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
@@ -223,10 +219,8 @@ jobs:
 
         - name: Install System Dependencies
           run: |
-            if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
-              sudo apt-get -qq update
-            fi
-            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
+            sudo apt-get -qq update
+            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
 
         - name: Set TESSDATA_PREFIX
           run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
 
@@ -0,0 +1,131 @@
+import os
+import shutil
+import subprocess
+from pathlib import Path
+from tempfile import mkdtemp
+from typing import Callable, Optional
+
+import pypdfium2
+from docx.document import Document
+from PIL import Image, ImageChops
+
+
+def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
+    """Locate the LibreOffice executable and optionally verify that it runs.
+
+    Lookup order:
+      1. the ``DOCLING_LIBREOFFICE_CMD`` environment variable (a path or a
+         command name resolvable on ``PATH``),
+      2. ``libreoffice`` on ``PATH``,
+      3. ``soffice`` on ``PATH``,
+      4. the default macOS application bundle location.
+
+    Args:
+        raise_if_unavailable: When true, raise instead of returning ``None``
+            if no executable is found, and run ``<cmd> -h`` as a smoke test.
+
+    Returns:
+        The resolved command, or ``None`` when nothing was found and
+        ``raise_if_unavailable`` is false.
+
+    Raises:
+        RuntimeError: No executable was found and ``raise_if_unavailable``
+            is true.
+        subprocess.CalledProcessError: The smoke test exited with a
+            non-zero status.
+        OSError: The smoke test process could not be started.
+    """
+
+    macos_bundle_cmd = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
+
+    # The MS Word backend tells users they can point to LibreOffice with
+    # DOCLING_LIBREOFFICE_CMD, so that override has to be consulted first.
+    # An unset or unresolvable value falls through to the automatic lookup.
+    env_cmd = os.environ.get("DOCLING_LIBREOFFICE_CMD")
+
+    libreoffice_cmd = (
+        (shutil.which(env_cmd) if env_cmd else None)
+        or shutil.which("libreoffice")
+        or shutil.which("soffice")
+        or (macos_bundle_cmd if os.path.isfile(macos_bundle_cmd) else None)
+    )
+
+    if raise_if_unavailable:
+        if libreoffice_cmd is None:
+            raise RuntimeError("Libreoffice not found")
+
+        # The following test will raise if the libreoffice_cmd cannot be used
+        subprocess.run(
+            [
+                libreoffice_cmd,
+                "-h",
+            ],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+            check=True,
+        )
+
+    return libreoffice_cmd
+
+
+def get_docx_to_pdf_converter() -> Optional[Callable]:
+    """Pick an available DOCX to PDF tool and return a conversion function.
+
+    Returns:
+        A callable ``convert(input_path, output_path)`` that writes the PDF
+        rendering of ``input_path`` to ``output_path``, or ``None`` if no
+        supported tool is available. Both arguments may be ``str`` or
+        ``pathlib.Path``.
+    """
+
+    # Try LibreOffice
+    libreoffice_cmd = get_libreoffice_cmd()
+
+    if libreoffice_cmd:
+
+        def convert_with_libreoffice(input_path, output_path):
+            """Convert ``input_path`` to PDF with LibreOffice in headless mode.
+
+            Raises ``subprocess.CalledProcessError`` when LibreOffice exits
+            with a non-zero status.
+            """
+            # Normalise to Path so that the comparison below is type-safe:
+            # a str never compares equal to a Path, which made the rename
+            # run even when the file was already at its final location.
+            input_path = Path(input_path)
+            output_path = Path(output_path)
+            # Path.parent yields "." for a bare file name, so --outdir is
+            # never passed an empty string.
+            out_dir = output_path.parent
+
+            subprocess.run(
+                [
+                    libreoffice_cmd,
+                    "--headless",
+                    "--convert-to",
+                    "pdf",
+                    "--outdir",
+                    str(out_dir),
+                    str(input_path),
+                ],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                check=True,
+            )
+
+            # LibreOffice names the result after the input file; move it if
+            # the caller asked for a different file name.
+            expected_output = out_dir / (input_path.stem + ".pdf")
+            if expected_output != output_path:
+                os.rename(expected_output, output_path)
+
+        return convert_with_libreoffice
+
+    ## Space for other DOCX to PDF converters if available
+
+    # No tools found
+    return None
+
+
+def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
+    """Crop away the uniform background surrounding the image content.
+
+    Args:
+        image: Image to crop.
+        bg_color: Colour treated as background. When ``None``, the colour of
+            the top-left pixel is used.
+        padding: Number of pixels kept around the detected content, clamped
+            to the image borders.
+
+    Returns:
+        The cropped image, or ``image`` itself when no pixel differs from
+        the background colour.
+    """
+    background = image.getpixel((0, 0)) if bg_color is None else bg_color
+
+    # Bounding box of every pixel that differs from a solid background canvas.
+    canvas = Image.new(image.mode, image.size, background)
+    content_box = ImageChops.difference(image, canvas).getbbox()
+
+    if not content_box:
+        return image
+
+    x0, y0, x1, y1 = content_box
+    return image.crop(
+        (
+            max(0, x0 - padding),
+            max(0, y0 - padding),
+            min(image.width, x1 + padding),
+            min(image.height, y1 + padding),
+        )
+    )
+
+
+def get_pil_from_dml_docx(
+    docx: Document, converter: Optional[Callable]
+) -> Optional[Image.Image]:
+    """Render a DOCX document to a cropped PIL image via a PDF round trip.
+
+    The document is saved to a temporary directory, converted to PDF with
+    ``converter``, and the first PDF page is rendered at ``scale=2`` and
+    passed through ``crop_whitespace``.
+
+    Args:
+        docx: Document to render.
+        converter: Callable ``(input_path, output_path)`` producing the PDF,
+            or ``None`` when no conversion tool is available.
+
+    Returns:
+        The rendered image, or ``None`` when ``converter`` is ``None``.
+        Errors raised while saving, converting or rendering propagate to
+        the caller.
+    """
+    if converter is None:
+        return None
+
+    temp_dir = Path(mkdtemp())
+    # try/finally guarantees that the temporary directory and the pdfium
+    # handles are released even when saving, converting or rendering fails.
+    try:
+        temp_docx = temp_dir / "drawing_only.docx"
+        temp_pdf = temp_dir / "drawing_only.pdf"
+
+        # 1) Save docx temporarily
+        docx.save(str(temp_docx))
+
+        # 2) Export to PDF
+        converter(temp_docx, temp_pdf)
+
+        # 3) Load PDF as PNG
+        pdf = pypdfium2.PdfDocument(temp_pdf)
+        try:
+            page = pdf[0]
+            try:
+                return crop_whitespace(page.render(scale=2).to_pil())
+            finally:
+                page.close()
+        finally:
+            # Close the document before the directory holding it is removed.
+            pdf.close()
+    finally:
+        shutil.rmtree(temp_dir, ignore_errors=True)
@@ -1,8 +1,9 @@
 import logging
 import re
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
-from typing import Any, List, Optional, Union
+from typing import Any, Callable, List, Optional, Union
 
 from docling_core.types.doc import (
     DocItemLabel,
@@ -33,6 +34,11 @@
 from typing_extensions import override
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.docx.drawingml.utils import (
+    get_docx_to_pdf_converter,
+    get_libreoffice_cmd,
+    get_pil_from_dml_docx,
+)
 from docling.backend.docx.latex.omml import oMath2Latex
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
@@ -64,6 +70,9 @@ def __init__(
         self.equation_bookends: str = "<eq>{EQ}</eq>"
         # Track processed textbox elements to avoid duplication
         self.processed_textbox_elements: List[int] = []
+        self.docx_to_pdf_converter: Optional[Callable] = None
+        self.docx_to_pdf_converter_init = False
+        self.display_drawingml_warning = True
 
         for i in range(-1, self.max_levels):
             self.parents[i] = None
@@ -80,18 +89,11 @@ def __init__(
             "indents": [None],
         }
 
-        self.docx_obj = None
-        try:
-            if isinstance(self.path_or_stream, BytesIO):
-                self.docx_obj = Document(self.path_or_stream)
-            elif isinstance(self.path_or_stream, Path):
-                self.docx_obj = Document(str(self.path_or_stream))
-
+        self.docx_obj = self.load_msword_file(
+            path_or_stream=self.path_or_stream, document_hash=self.document_hash
+        )
+        if self.docx_obj:
             self.valid = True
-        except Exception as e:
-            raise RuntimeError(
-                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
-            ) from e
 
     @override
     def is_valid(self) -> bool:
@@ -139,6 +141,22 @@ def convert(self) -> DoclingDocument:
                 f"Cannot convert doc with {self.document_hash} because the backend failed to init."
             )
 
+    @staticmethod
+    def load_msword_file(
+        path_or_stream: Union[BytesIO, Path], document_hash: str
+    ) -> Optional[DocxDocument]:
+        """Open a Word document from a file path or an in-memory stream.
+
+        Args:
+            path_or_stream: Source of the document.
+            document_hash: Hash of the document, used only in the error
+                message.
+
+        Returns:
+            The loaded document, or ``None`` when ``path_or_stream`` is
+            neither a ``BytesIO`` nor a ``Path``.
+
+        Raises:
+            RuntimeError: The document could not be loaded; the original
+                exception is chained as the cause.
+        """
+        try:
+            if isinstance(path_or_stream, BytesIO):
+                return Document(path_or_stream)
+            if isinstance(path_or_stream, Path):
+                return Document(str(path_or_stream))
+        except Exception as e:
+            raise RuntimeError(
+                f"MsWordDocumentBackend could not load document with hash {document_hash}"
+            ) from e
+
+        # Unsupported source type: callers treat None as "not loaded".
+        return None
+
     def _update_history(
         self,
         name: str,
@@ -195,6 +213,7 @@ def _walk_linear(
             }
             xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
             drawing_blip = xpath_expr(element)
+            drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
 
             # Check for textbox content - check multiple textbox formats
             # Only process if the element hasn't been processed before
@@ -274,6 +293,26 @@ def _walk_linear(
                 ):
                     te1 = self._handle_text_elements(element, docx_obj, doc)
                     added_elements.extend(te1)
+            # Check for DrawingML elements
+            elif drawingml_els:
+                if (
+                    self.docx_to_pdf_converter is None
+                    and self.docx_to_pdf_converter_init is False
+                ):
+                    self.docx_to_pdf_converter = get_docx_to_pdf_converter()
+                    self.docx_to_pdf_converter_init = True
+
+                if self.docx_to_pdf_converter is None:
+                    if self.display_drawingml_warning:
+                        if self.docx_to_pdf_converter is None:
+                            _log.warning(
+                                "Found DrawingML elements in document, but no DOCX to PDF converters. "
+                                "If you want these exported, make sure you have "
+                                "LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
+                            )
+                            self.display_drawingml_warning = False
+                else:
+                    self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
             # Check for the sdt containers, like table of contents
             elif tag_name in ["sdt"]:
                 sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -1381,3 +1420,39 @@ def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
                 )
                 elem_ref.append(p3.get_ref())
         return elem_ref
+
+    def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
+        """Render DrawingML elements to an image and add it as a picture.
+
+        The elements are copied into an otherwise empty copy of the source
+        document, which is exported DOCX->PDF->PNG. When the export or the
+        image handling fails, a picture item without image data is added
+        instead, so the element still appears in the output.
+
+        Args:
+            doc: Document that receives the picture item.
+            drawingml_els: Iterable of ``w:drawing`` elements to render.
+        """
+        # Imported locally because only this method needs it.
+        import subprocess
+
+        # 1) Make an empty copy of the original document
+        dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
+        body = dml_doc._element.body
+        for child in list(body):
+            body.remove(child)
+
+        # 2) Add DrawingML to empty document
+        new_para = dml_doc.add_paragraph()
+        new_r = new_para.add_run()
+        for dml in drawingml_els:
+            new_r._r.append(deepcopy(dml))
+
+        # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
+        level = self._get_level()
+        try:
+            pil_image = get_pil_from_dml_docx(
+                dml_doc, converter=self.docx_to_pdf_converter
+            )
+            if pil_image is None:
+                raise UnidentifiedImageError
+
+            doc.add_picture(
+                parent=self.parents[level - 1],
+                image=ImageRef.from_pil(image=pil_image, dpi=72),
+                caption=None,
+            )
+        # A failed converter run raises subprocess.CalledProcessError, which
+        # is not an OSError; without it here one bad drawing would abort the
+        # conversion of the whole document instead of adding a placeholder.
+        except (UnidentifiedImageError, OSError, subprocess.SubprocessError):
+            _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
+            doc.add_picture(
+                parent=self.parents[level - 1],
+                caption=None,
+            )
@@ -0,0 +1,13 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: section: group textbox
+    item-2 at level 2: text: Text 2
+    item-3 at level 2: text: Text 1
+  item-4 at level 1: picture
+  item-5 at level 1: text: 
+  item-6 at level 1: text: 
+  item-7 at level 1: text: 
+  item-8 at level 1: text: 
+  item-9 at level 1: text: 
+  item-10 at level 1: text: 
+  item-11 at level 1: text: 
+  item-12 at level 1: picture