docling-project
diff --git a/‎demo/demo_document_figure_classifier_predictor.py‎
Lines changed: 111 additions & 0 deletions b/‎demo/demo_document_figure_classifier_predictor.py‎
Lines changed: 111 additions & 0 deletions
diff --git a/‎docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py‎
Lines changed: 177 additions & 0 deletions b/‎docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py‎
Lines changed: 177 additions & 0 deletions
diff --git a/‎tests/test_data/figure_classifier/images/bar_chart.jpg‎
47.5 KB b/‎tests/test_data/figure_classifier/images/bar_chart.jpg‎
47.5 KB
diff --git a/‎tests/test_data/figure_classifier/images/map.jpg‎
130 KB b/‎tests/test_data/figure_classifier/images/map.jpg‎
130 KB
diff --git a/‎tests/test_document_figure_classifier.py‎
Lines changed: 98 additions & 0 deletions b/‎tests/test_document_figure_classifier.py‎
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,111 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+import argparse
+import logging
+import os
+import sys
+import time
+from pathlib import Path
+
+from huggingface_hub import snapshot_download
+from PIL import Image
+
+from docling_ibm_models.document_figure_classifier_model.document_figure_classifier_predictor import DocumentFigureClassifierPredictor
+
+
+def demo(
+    logger: logging.Logger,
+    artifact_path: str,
+    device: str,
+    num_threads: int,
+    image_dir: str,
+    viz_dir: str,
+):
+    r"""
+    Apply DocumentFigureClassifierPredictor on the input image directory
+
+    Parameters
+    ----------
+    logger : logging.Logger
+        Logger receiving the timing summary and the per-image predictions.
+    artifact_path : str
+        Directory with the pretrained model files, forwarded to the predictor.
+    device : str
+        Device forwarded to the predictor.
+    num_threads : int
+        Number of threads forwarded to the predictor.
+    image_dir : str
+        Directory whose regular files are all opened as input images.
+    viz_dir : str
+        Not used by this function; kept so that existing callers keep working.
+    """
+    # Create the figure classifier predictor
+    document_figure_classifier_predictor = DocumentFigureClassifierPredictor(
+        artifact_path, device=device, num_threads=num_threads
+    )
+
+    image_root = Path(image_dir)
+    # Sub-directories cannot be opened as images, so only regular files are kept
+    image_names = sorted(
+        name for name in os.listdir(image_root) if (image_root / name).is_file()
+    )
+
+    images = []
+    for image_name in image_names:
+        # convert() reads the pixel data and returns a new image object, which
+        # lets the file handle be released as soon as the `with` block ends
+        with Image.open(image_root / image_name) as image:
+            images.append(image.convert("RGB"))
+
+    # Nothing to classify: bail out instead of sending an empty batch to the model
+    if not images:
+        logger.info("No images found in '{}'".format(image_root))
+        return
+
+    t0 = time.perf_counter()
+    outputs = document_figure_classifier_predictor.predict(images)
+    total_ms = 1000 * (time.perf_counter() - t0)
+    avg_ms = total_ms / len(images)
+    logger.info(
+        "For {} images(ms): [total|avg] = [{:.1f}|{:.1f}]".format(
+            len(images), total_ms, avg_ms
+        )
+    )
+
+    # Outputs are returned in input order, so they line up with the sorted names
+    for image_name, output in zip(image_names, outputs):
+        logger.info(f"Predictions for: '{image_name}':")
+        for class_name, probability in output:
+            logger.info(f" Class '{class_name}' has probability {probability}")
+
+def main(args):
+    r"""
+    Run the figure classifier demo with the parsed command line arguments
+
+    Parameters
+    ----------
+    args : argparse.Namespace
+        Parsed arguments providing `device`, `num_threads`, `image_dir` and
+        `viz_dir`.
+    """
+    num_threads = int(args.num_threads) if args.num_threads is not None else None
+    device = args.device.lower()
+    image_dir = args.image_dir
+    viz_dir = args.viz_dir
+
+    # Initialize logger
+    # basicConfig guarantees that the root logger has a handler, so the named
+    # logger below always reports hasHandlers() == True through propagation and
+    # needs no handler of its own.
+    logging.basicConfig(level=logging.DEBUG)
+    logger = logging.getLogger("DocumentFigureClassifierPredictor")
+    logger.setLevel(logging.DEBUG)
+
+    # Ensure the viz dir
+    Path(viz_dir).mkdir(parents=True, exist_ok=True)
+
+    # Download models from HF
+    download_path = snapshot_download(
+        repo_id="ds4sd/DocumentFigureClassifier", revision="v1.0.0"
+    )
+
+    # Test the figure classifier model
+    demo(logger, download_path, device, num_threads, image_dir, viz_dir)
+
+if __name__ == "__main__":
+    # Usage:
+    #   python -m demo.demo_document_figure_classifier_predictor -i <images_dir>
+    parser = argparse.ArgumentParser(
+        description="Test the DocumentFigureClassifierPredictor"
+    )
+    parser.add_argument(
+        "-d", "--device", required=False, default="cpu", help="One of [cpu, cuda, mps]"
+    )
+    # type=int makes argparse reject non-numeric values with a usage error
+    # instead of failing later with a traceback
+    parser.add_argument(
+        "-n",
+        "--num_threads",
+        required=False,
+        type=int,
+        default=4,
+        help="Number of threads",
+    )
+    # The demo opens every file in the directory with PIL, whatever its format
+    parser.add_argument(
+        "-i",
+        "--image_dir",
+        required=True,
+        help="Images input directory",
+    )
+    parser.add_argument(
+        "-v",
+        "--viz_dir",
+        required=False,
+        default="viz/",
+        help="Directory to save prediction visualizations",
+    )
+
+    args = parser.parse_args()
+    main(args)
@@ -0,0 +1,177 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+import logging
+from typing import List, Tuple, Union
+
+import numpy as np
+import torch
+import torchvision.transforms as transforms
+from PIL import Image
+from transformers import AutoConfig, AutoModelForImageClassification
+
+_log = logging.getLogger(__name__)
+
+
+class DocumentFigureClassifierPredictor:
+    r"""
+    Model for classifying document figures.
+
+    Classifies figures as 1 out of 16 possible classes.
+
+    The classes are:
+        1. "bar_chart"
+        2. "bar_code"
+        3. "chemistry_markush_structure"
+        4. "chemistry_molecular_structure"
+        5. "flow_chart"
+        6. "icon"
+        7. "line_chart"
+        8. "logo"
+        9. "map"
+        10. "other"
+        11. "pie_chart"
+        12. "qr_code"
+        13. "remote_sensing"
+        14. "screenshot"
+        15. "signature"
+        16. "stamp"
+
+    Attributes
+    ----------
+    _device : str
+        The device on which the model is loaded (e.g., 'cpu' or 'cuda').
+    _num_threads : int
+        Number of threads used for inference when running on CPU.
+    _model : transformers image classification model
+        Pretrained model loaded with AutoModelForImageClassification
+        (an EfficientNetb0 according to the original documentation).
+    _image_processor : torchvision.transforms.Compose
+        Resizes the image to 224x224, converts it to a tensor and normalizes it.
+    _classes : List[str]
+        The class names used by the model, ordered by label id.
+
+    Methods
+    -------
+    __init__(artifacts_path, device, num_threads)
+        Initializes the DocumentFigureClassifierPredictor with the specified parameters.
+    info() -> dict:
+        Retrieves configuration details of the DocumentFigureClassifierPredictor instance.
+    predict(images) -> List[List[Tuple[str, float]]]
+        The (class name, confidence score) pairs for each image, best first.
+    """
+
+    def __init__(
+        self,
+        artifacts_path: str,
+        device: str = "cpu",
+        num_threads: int = 4,
+    ):
+        r"""
+        Initializes the DocumentFigureClassifierPredictor.
+
+        Parameters
+        ----------
+        artifacts_path : str
+            Path to the directory containing the pretrained model files.
+        device : str, optional
+            Device to run the inference on ('cpu' or 'cuda'), by default "cpu".
+        num_threads : int, optional
+            Number of threads for CPU inference, by default 4.
+        """
+        self._device = device
+        self._num_threads = num_threads
+
+        # The thread count is only applied for CPU inference
+        if device == "cpu":
+            torch.set_num_threads(self._num_threads)
+
+        model = AutoModelForImageClassification.from_pretrained(artifacts_path)
+        self._model = model.to(device)
+        self._model.eval()
+
+        # NOTE(review): the normalization constants presumably mirror the
+        # preprocessing used at training time - confirm before changing them.
+        self._image_processor = transforms.Compose(
+            [
+                transforms.Resize((224, 224)),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=[0.485, 0.456, 0.406],
+                    std=[0.47853944, 0.4732864, 0.47434163],
+                ),
+            ]
+        )
+
+        config = AutoConfig.from_pretrained(artifacts_path)
+
+        # Logit column i belongs to label id i, so the names are ordered by id.
+        # The iteration order of the mapping is not relied upon, and neither is
+        # the alphabetical order of the names.
+        id2label = config.id2label
+        self._classes = [id2label[label_id] for label_id in sorted(id2label, key=int)]
+
+        _log.debug(
+            "DocumentFigureClassifierPredictor settings: {}".format(self.info())
+        )
+
+    def info(self) -> dict:
+        """
+        Retrieves configuration details of the DocumentFigureClassifierPredictor instance.
+
+        Returns
+        -------
+        dict
+            A dictionary containing configuration details such as the device,
+            the number of threads used and the classes used by the model.
+        """
+        info = {
+            "device": self._device,
+            "num_threads": self._num_threads,
+            "classes": self._classes,
+        }
+        return info
+
+    @staticmethod
+    def _to_rgb(image: Union[Image.Image, np.ndarray]) -> Image.Image:
+        r"""
+        Returns the input as an RGB PIL image, rejecting unsupported types.
+        """
+        if isinstance(image, Image.Image):
+            return image.convert("RGB")
+        if isinstance(image, np.ndarray):
+            return Image.fromarray(image).convert("RGB")
+        raise TypeError(
+            "Supported input formats are PIL.Image.Image or numpy.ndarray."
+        )
+
+    def predict(
+        self, images: List[Union[Image.Image, np.ndarray]]
+    ) -> List[List[Tuple[str, float]]]:
+        r"""
+        Performs inference on a batch of figures.
+
+        Parameters
+        ----------
+        images : List[Union[Image.Image, np.ndarray]]
+            A list of input images for inference. Each image can either be a
+            PIL.Image.Image object or a NumPy array representing an image.
+
+        Returns
+        -------
+        List[List[Tuple[str, float]]]
+            A list of predictions for each input image. Each prediction is a list of
+            tuples representing the predicted class and confidence score:
+            - str: The predicted class name for the image.
+            - float: The confidence score associated with the predicted class,
+                ranging from 0 to 1.
+
+            The predictions for each image are sorted in descending order of confidence.
+            An empty input list yields an empty list.
+
+        Raises
+        ------
+        TypeError
+            If an element is neither a PIL.Image.Image nor a numpy.ndarray.
+        """
+        # All inputs are validated and converted before any tensor work starts
+        rgb_images = [self._to_rgb(image) for image in images]
+
+        # torch.stack cannot build a batch from zero tensors
+        if not rgb_images:
+            return []
+
+        # (batch_size, 3, 224, 224)
+        tensors = [self._image_processor(image) for image in rgb_images]
+        batch = torch.stack(tensors).to(self._device)
+
+        with torch.no_grad():
+            logits = self._model(batch).logits  # (batch_size, num_classes)
+            probs_batch = logits.softmax(dim=1)  # (batch_size, num_classes)
+            probs_batch = probs_batch.cpu().numpy().tolist()
+
+        predictions_batch = []
+        for probs_image in probs_batch:
+            preds = [(self._classes[i], prob) for i, prob in enumerate(probs_image)]
+            preds.sort(key=lambda t: t[1], reverse=True)
+            predictions_batch.append(preds)
+
+        return predictions_batch
@@ -0,0 +1,98 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+import os
+import numpy as np
+import pytest
+from PIL import Image
+
+from docling_ibm_models.document_figure_classifier_model.document_figure_classifier_predictor import (
+    DocumentFigureClassifierPredictor,
+)
+
+from huggingface_hub import snapshot_download
+
+
+@pytest.fixture(scope="module")
+def init() -> dict:
+    r"""
+    Build the settings shared by the figure classifier tests
+
+    The returned dictionary holds the test images with their expected labels
+    and the local path of the model snapshot downloaded from Hugging Face.
+    """
+    bar_chart_sample = {
+        "label": "bar_chart",
+        "image_path": "tests/test_data/figure_classifier/images/bar_chart.jpg",
+    }
+    map_sample = {
+        "label": "map",
+        "image_path": "tests/test_data/figure_classifier/images/map.jpg",
+    }
+
+    settings = {
+        "num_threads": 1,
+        "test_imgs": [bar_chart_sample, map_sample],
+        "info": {
+            "device": "auto",
+        },
+        # Download models from HF
+        "artifact_path": snapshot_download(
+            repo_id="ds4sd/DocumentFigureClassifier", revision="v1.0.0"
+        ),
+    }
+    return settings
+
+
+def test_figure_classifier(init: dict):
+    r"""
+    Unit test for the DocumentFigureClassifierPredictor
+
+    Checks the reported settings, the rejection of unsupported inputs and the
+    top prediction for every test image, both one by one and as a batch.
+    """
+    device = "cpu"
+    num_threads = 2
+
+    # Initialize DocumentFigureClassifierPredictor
+    figure_classifier = DocumentFigureClassifierPredictor(
+        init["artifact_path"], device=device, num_threads=num_threads
+    )
+
+    # Check info
+    info = figure_classifier.info()
+    assert info["device"] == device, "Wrongly set device"
+    assert info["num_threads"] == num_threads, "Wrongly set number of threads"
+
+    # Unsupported input image
+    with pytest.raises(TypeError):
+        figure_classifier.predict(["wrong"])
+
+    # Predict on test images, not batched
+    for d in init["test_imgs"]:
+        label = d["label"]
+        img_path = d["image_path"]
+
+        with Image.open(img_path) as img:
+            # Each prediction list is sorted best first: [0][0][0] is the
+            # top class name of the first (and only) image
+            output = figure_classifier.predict([img])
+            predicted_class = output[0][0][0]
+
+            assert predicted_class == label
+
+            # Load images as numpy arrays
+            np_arr = np.asarray(img)
+            output = figure_classifier.predict([np_arr])
+            predicted_class = output[0][0][0]
+
+            assert predicted_class == label
+
+    # Predict on test images, batched
+    labels = [d["label"] for d in init["test_imgs"]]
+    images = []
+    for d in init["test_imgs"]:
+        # copy() loads the pixel data, so the file can be closed right away
+        with Image.open(d["image_path"]) as img:
+            images.append(img.copy())
+
+    outputs = figure_classifier.predict(images)
+    outputs = [output[0][0] for output in outputs]
+    assert outputs == labels