feat: Refactor the LayoutPredictor to support all layout models (#121)

nikos-livathinos · web-flow · commit 505fbf4841e3 · 2025-07-09T10:09:54.000+02:00
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
diff --git a/demo/demo_layout_predictor.py b/demo/demo_layout_predictor.py
@@ -8,7 +8,7 @@
 import sys
 import time
 from pathlib import Path
-
+from typing import Any, Dict, List
 import numpy as np
 import torch
 from huggingface_hub import snapshot_download
@@ -17,7 +17,9 @@
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 
 
-def save_predictions(prefix: str, viz_dir: str, img_fn: str, img, predictions: dict):
+def save_predictions(
+    prefix: str, viz_dir: str, img_fn: Path, img, predictions: List[Dict[str, Any]]
+):
     img_path = Path(img_fn)
 
     image = img.copy()
@@ -37,7 +39,7 @@ def save_predictions(prefix: str, viz_dir: str, img_fn: str, img, predictions: d
             confidence = round(pred["confidence"], 3)
 
             # Save the predictions in txt file
-            pred_txt = f"{prefix} {img_fn}: {label} - {bbox} - {confidence}\n"
+            pred_txt = f"{prefix} {str(img_fn)}: {label} - {bbox} - {confidence}\n"
             fd.write(pred_txt)
 
             # Draw the bbox and label
@@ -59,6 +61,7 @@ def demo(
     num_threads: int,
     img_dir: str,
     viz_dir: str,
+    threshold: float,
 ):
     r"""
     Apply LayoutPredictor on the input image directory
@@ -67,7 +70,7 @@ def demo(
     pdf_image = pyvips.Image.new_from_file("test_data/ADS.2007.page_123.pdf", page=0)
     """
     # Create the layout predictor
-    lpredictor = LayoutPredictor(artifact_path, device=device, num_threads=num_threads)
+    predictor = LayoutPredictor(artifact_path, device=device, num_threads=num_threads, base_threshold=threshold)
 
     # Predict all test png images
     t0 = time.perf_counter()
@@ -79,7 +82,7 @@ def demo(
         with Image.open(img_fn) as image:
             # Predict layout
             img_t0 = time.perf_counter()
-            preds = list(lpredictor.predict(image))
+            preds: List[Dict[str, Any]] = list(predictor.predict(image))
             img_ms = 1000 * (time.perf_counter() - img_t0)
             logger.debug("Prediction(ms): {:.2f}".format(img_ms))
 
@@ -97,10 +100,12 @@ def demo(
 
 def main(args):
     r""" """
-    num_threads = int(args.num_threads) if args.num_threads is not None else None
+    num_threads = int(args.num_threads) if args.num_threads is not None else 4
     device = args.device.lower()
     img_dir = args.img_dir
     viz_dir = args.viz_dir
+    hugging_face_repo = args.hugging_face_repo
+    threshold = float(args.threshold)
 
     # Initialize logger
     logging.basicConfig(level=logging.DEBUG)
@@ -118,20 +123,36 @@ def main(args):
     Path(viz_dir).mkdir(parents=True, exist_ok=True)
 
     # Download models from HF
-    download_path = snapshot_download(
-        repo_id="ds4sd/docling-models", revision="v2.1.0"
-    )
-    artifact_path = os.path.join(download_path, "model_artifacts/layout")
+    download_path = snapshot_download(repo_id=hugging_face_repo)
 
     # Test the LayoutPredictor
-    demo(logger, artifact_path, device, num_threads, img_dir, viz_dir)
+    demo(logger, download_path, device, num_threads, img_dir, viz_dir, threshold)
 
 
 if __name__ == "__main__":
     r"""
     python -m demo.demo_layout_predictor -i <images_dir>
     """
     parser = argparse.ArgumentParser(description="Test the LayoutPredictor")
+
+    supported_hf_repos = [
+        "ds4sd/docling-layout-old",
+        "ds4sd/docling-layout-heron",
+        "ds4sd/docling-layout-heron-101",
+        "ds4sd/docling-layout-egret-medium",
+        "ds4sd/docling-layout-egret-large",
+        "ds4sd/docling-layout-egret-xlarge",
+    ]
+    parser.add_argument(
+        "-r",
+        "--hugging-face-repo",
+        required=False,
+        default="ds4sd/docling-layout-old",
+        help=f"The hugging face repo id: [{', '.join(supported_hf_repos)}]",
+    )
+    parser.add_argument(
+        "-t", "--threshold", required=False, default=0.3, help="Threshold for the LayoutPredictor"
+    )
     parser.add_argument(
         "-d", "--device", required=False, default="cpu", help="One of [cpu, cuda, mps]"
     )
diff --git a/docling_ibm_models/layoutmodel/labels.py b/docling_ibm_models/layoutmodel/labels.py
@@ -0,0 +1,53 @@
+from typing import Dict
+
+
+class LayoutLabels:
+    r"""Single point of reference for the layout labels: id/name maps with and without a background class"""
+
+    def __init__(self) -> None:
+        r"""Build the canonical and the background-shifted label maps together with their inverses"""
+        # Canonical classes originating in DLNv2 (class id -> label name)
+        self._canonical: Dict[int, str] = {
+            # DLNv1 and DLNv2
+            0: "Caption",
+            1: "Footnote",
+            2: "Formula",
+            3: "List-item",
+            4: "Page-footer",
+            5: "Page-header",
+            6: "Picture",
+            7: "Section-header",
+            8: "Table",
+            9: "Text",
+            10: "Title",
+            # DLNv2 only
+            11: "Document Index",
+            12: "Code",
+            13: "Checkbox-Selected",
+            14: "Checkbox-Unselected",
+            15: "Form",
+            16: "Key-Value Region",
+        }
+        self._inverse_canonical: Dict[str, int] = {
+            label: class_id for class_id, label in self._canonical.items()
+        }
+
+        # Same classes with every id shifted by +1 so that id 0 is "Background"
+        self._shifted_canonical: Dict[int, str] = {0: "Background"}
+        for k, v in self._canonical.items():
+            self._shifted_canonical[k + 1] = v
+        self._inverse_shifted_canonical: Dict[str, int] = {
+            label: class_id for class_id, label in self._shifted_canonical.items()
+        }
+
+    def canonical_categories(self) -> Dict[int, str]:
+        return self._canonical
+
+    def canonical_to_int(self) -> Dict[str, int]:
+        return self._inverse_canonical
+
+    def shifted_canonical_categories(self) -> Dict[int, str]:
+        return self._shifted_canonical
+
+    def shifted_canonical_to_int(self) -> Dict[str, int]:
+        return self._inverse_shifted_canonical
diff --git a/docling_ibm_models/layoutmodel/layout_predictor.py b/docling_ibm_models/layoutmodel/layout_predictor.py
@@ -6,13 +6,15 @@
 import os
 import threading
 from collections.abc import Iterable
-from typing import Set, Union
+from typing import Dict, List, Set, Union
 
 import numpy as np
 import torch
-import torchvision.transforms as T
 from PIL import Image
-from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
+from torch import Tensor
+from transformers import AutoModelForObjectDetection, RTDetrImageProcessor
+
+from docling_ibm_models.layoutmodel.labels import LayoutLabels
 
 _log = logging.getLogger(__name__)
 
@@ -46,70 +48,67 @@ def __init__(
         ------
         FileNotFoundError when the model's torch file is missing
         """
-        # Initialize classes map:
-        self._classes_map = {
-            0: "background",
-            1: "Caption",
-            2: "Footnote",
-            3: "Formula",
-            4: "List-item",
-            5: "Page-footer",
-            6: "Page-header",
-            7: "Picture",
-            8: "Section-header",
-            9: "Table",
-            10: "Text",
-            11: "Title",
-            12: "Document Index",
-            13: "Code",
-            14: "Checkbox-Selected",
-            15: "Checkbox-Unselected",
-            16: "Form",
-            17: "Key-Value Region",
-        }
-
         # Blacklisted classes
         self._black_classes = blacklist_classes  # set(["Form", "Key-Value Region"])
 
+        # Canonical classes
+        self._labels = LayoutLabels()
+
         # Set basic params
         self._threshold = base_threshold  # Score threshold
-        self._image_size = 640
-        self._size = np.asarray([[self._image_size, self._image_size]], dtype=np.int64)
 
         # Set number of threads for CPU
         self._device = torch.device(device)
         self._num_threads = num_threads
         if device == "cpu":
             torch.set_num_threads(self._num_threads)
 
-        # Model file and configurations
+        # Load model file and configurations
+        self._processor_config = os.path.join(artifact_path, "preprocessor_config.json")
+        self._model_config = os.path.join(artifact_path, "config.json")
         self._st_fn = os.path.join(artifact_path, "model.safetensors")
         if not os.path.isfile(self._st_fn):
             raise FileNotFoundError("Missing safe tensors file: {}".format(self._st_fn))
+        if not os.path.isfile(self._processor_config):
+            raise FileNotFoundError(
+                f"Missing processor config file: {self._processor_config}"
+            )
+        if not os.path.isfile(self._model_config):
+            raise FileNotFoundError(f"Missing model config file: {self._model_config}")
 
         # Load model and move to device
-        processor_config = os.path.join(artifact_path, "preprocessor_config.json")
-        model_config = os.path.join(artifact_path, "config.json")
-        self._image_processor = RTDetrImageProcessor.from_json_file(processor_config)
+        self._image_processor = RTDetrImageProcessor.from_json_file(
+            self._processor_config
+        )
 
         # Use lock to prevent threading issues during model initialization
         with _model_init_lock:
-            self._model = RTDetrForObjectDetection.from_pretrained(
-                artifact_path, config=model_config
+            self._model = AutoModelForObjectDetection.from_pretrained(
+                artifact_path, config=self._model_config
             ).to(self._device)
             self._model.eval()
 
+        # Set classes map
+        self._model_name = type(self._model).__name__
+        if self._model_name == "RTDetrForObjectDetection":
+            self._classes_map = self._labels.shifted_canonical_categories()
+            self._label_offset = 1
+        else:
+            self._classes_map = self._labels.canonical_categories()
+            self._label_offset = 0
+
         _log.debug("LayoutPredictor settings: {}".format(self.info()))
 
     def info(self) -> dict:
         """
         Get information about the configuration of LayoutPredictor
         """
         info = {
+            "model_name": self._model_name,
             "safe_tensors_file": self._st_fn,
             "device": self._device.type,
             "num_threads": self._num_threads,
-            "image_size": self._image_size,
+            "image_size": self._image_processor.size,
             "threshold": self._threshold,
         }
         return info
@@ -141,28 +140,27 @@ def predict(self, orig_img: Union[Image.Image, np.ndarray]) -> Iterable[dict]:
         else:
             raise TypeError("Not supported input image format")
 
-        resize = {"height": self._image_size, "width": self._image_size}
-        inputs = self._image_processor(
-            images=page_img,
-            return_tensors="pt",
-            size=resize,
-        ).to(self._device)
+        target_sizes = torch.tensor([page_img.size[::-1]])
+        inputs = self._image_processor(images=[page_img], return_tensors="pt").to(
+            self._device
+        )
         outputs = self._model(**inputs)
-        results = self._image_processor.post_process_object_detection(
-            outputs,
-            target_sizes=torch.tensor([page_img.size[::-1]]),
-            threshold=self._threshold,
+        results: List[Dict[str, Tensor]] = (
+            self._image_processor.post_process_object_detection(
+                outputs,
+                target_sizes=target_sizes,
+                threshold=self._threshold,
+            )
         )
 
         w, h = page_img.size
-
         result = results[0]
         for score, label_id, box in zip(
             result["scores"], result["labels"], result["boxes"]
         ):
             score = float(score.item())
 
-            label_id = int(label_id.item()) + 1  # Advance the label_id
+            label_id = int(label_id.item()) + self._label_offset
             label_str = self._classes_map[label_id]
 
             # Filter out blacklisted classes
diff --git a/tests/test_layout_predictor.py b/tests/test_layout_predictor.py
@@ -4,13 +4,11 @@
 #
 import os
 import json
-from pathlib import Path
 
-import torch
 import numpy as np
 import pytest
 from huggingface_hub import snapshot_download
-from PIL import Image, ImageDraw, ImageFont
+from PIL import Image
 
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 
@@ -35,8 +33,7 @@ def init() -> dict:
     }
 
     # Download models from HF
-    download_path = snapshot_download(repo_id="ds4sd/docling-models", revision="v2.1.0")
-    artifact_path = os.path.join(download_path, "model_artifacts/layout")
+    artifact_path = snapshot_download(repo_id="ds4sd/docling-layout-old")
 
     # Add the missing config keys
     init["artifact_path"] = artifact_path