88from PIL import Image
99from pydantic import AnyUrl
1010
11- from docling .datamodel .base_models import OpenAiApiResponse
11+ from docling .datamodel .base_models import OpenAiApiResponse , OpenAiResponseUsage
1212from docling .models .utils .generation_utils import GenerationStopper
1313
1414_log = logging .getLogger (__name__ )
1515
1616
17- def api_image_request (
17+ def api_image_request_with_usage (
1818 image : Image .Image ,
1919 prompt : str ,
2020 url : AnyUrl ,
2121 timeout : float = 20 ,
2222 headers : Optional [Dict [str , str ]] = None ,
23- token_extract_key : Optional [str ] = None ,
2423 ** params ,
25- ) -> Tuple [str , Optional [dict ]]:
24+ ) -> Tuple [str , Optional [OpenAiResponseUsage ]]:
2625 """Send an image+prompt to an OpenAI-compatible API and return (text, usage).
2726
2827 If no usage data is available, the second tuple element will be None.
@@ -38,138 +37,34 @@ def api_image_request(
3837 "type" : "image_url" ,
3938 "image_url" : {"url" : f"data:image/png;base64,{ image_base64 } " },
4039 },
41- {"type" : "text" , "text" : prompt },
40+ {
41+ "type" : "text" ,
42+ "text" : prompt ,
43+ },
4244 ],
4345 }
4446 ]
4547
46- payload = {"messages" : messages , ** params }
48+ payload = {
49+ "messages" : messages ,
50+ ** params ,
51+ }
52+
4753 headers = headers or {}
4854
49- r = requests .post (str (url ), headers = headers , json = payload , timeout = timeout )
55+ r = requests .post (
56+ str (url ),
57+ headers = headers ,
58+ json = payload ,
59+ timeout = timeout ,
60+ )
5061 if not r .ok :
5162 _log .error (f"Error calling the API. Response was { r .text } " )
5263 r .raise_for_status ()
5364
54- # Try to parse JSON body
55- try :
56- resp_json = r .json ()
57- except Exception :
58- api_resp = OpenAiApiResponse .model_validate_json (r .text )
59- generated_text = api_resp .choices [0 ].message .content .strip ()
60- return generated_text , None
61-
62- usage = None
63- if isinstance (resp_json , dict ):
64- usage = resp_json .get ("usage" )
65-
66- # Extract generated text using common OpenAI shapes
67- generated_text = ""
68- try :
69- generated_text = resp_json ["choices" ][0 ]["message" ]["content" ].strip ()
70- except Exception :
71- try :
72- generated_text = resp_json ["choices" ][0 ].get ("text" , "" )
73- if isinstance (generated_text , str ):
74- generated_text = generated_text .strip ()
75- except Exception :
76- try :
77- api_resp = OpenAiApiResponse .model_validate_json (r .text )
78- generated_text = api_resp .choices [0 ].message .content .strip ()
79- except Exception :
80- generated_text = ""
65+ api_resp = OpenAiApiResponse .model_validate_json (r .text )
66+ generated_text = api_resp .choices [0 ].message .content .strip ()
8167
82- # If an explicit token_extract_key is provided and found in usage, use it
83- if token_extract_key and isinstance (usage , dict ) and token_extract_key in usage :
84- extracted = usage .get (token_extract_key )
85- generated_text = (
86- str (extracted ).strip () if extracted is not None else generated_text
87- )
68+ usage = api_resp .usage if hasattr (api_resp , "usage" ) else None
8869
8970 return generated_text , usage
90-
91-
def api_image_request_streaming(
    image: Image.Image,
    prompt: str,
    url: AnyUrl,
    *,
    timeout: float = 20,
    headers: Optional[Dict[str, str]] = None,
    generation_stoppers: Optional[List[GenerationStopper]] = None,
    **params,
) -> str:
    """Stream a chat completion from an OpenAI-compatible server (e.g., vLLM).

    Parses Server-Sent Events lines of the form ``data: {json}`` separated by
    blank lines and terminated by ``data: [DONE]``. Accumulates generated text
    and polls each stopper's ``should_stop(window)`` as chunks arrive. If a
    stopper triggers, the function returns early and the HTTP connection is
    closed by the ``with`` block, aborting server-side generation.

    Args:
        image: Image sent alongside the prompt (encoded as base64 PNG).
        prompt: Text prompt for the model.
        url: Chat-completions endpoint of the OpenAI-compatible server.
        timeout: Request timeout in seconds.
        headers: Extra HTTP headers merged into the request.
        generation_stoppers: Stoppers consulted after every received chunk.
            Defaults to no stoppers.
        **params: Additional keys merged into the JSON payload.

    Returns:
        The accumulated generated text (possibly truncated by a stopper).

    Raises:
        requests.HTTPError: If the server responds with a non-2xx status.
    """
    # Fix: the default was previously a mutable `[]`, which is shared across
    # calls (flake8-bugbear B006). Use None and normalize here instead.
    stoppers: List[GenerationStopper] = (
        generation_stoppers if generation_stoppers is not None else []
    )

    img_io = BytesIO()
    image.save(img_io, "PNG")
    image_b64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    payload = {"messages": messages, "stream": True, **params}
    _log.debug(f"API streaming request payload: {json.dumps(payload, indent=2)}")

    hdrs = {"Accept": "text/event-stream", **(headers or {})}
    if "temperature" in params:
        hdrs["X-Temperature"] = str(params["temperature"])

    # Stream the HTTP response; the context manager guarantees the connection
    # is released (and server-side generation aborted) on early return.
    with requests.post(
        str(url), headers=hdrs, json=payload, timeout=timeout, stream=True
    ) as r:
        if not r.ok:
            _log.error(
                f"Error calling the API {url} in streaming mode. Response was {r.text}"
            )
            r.raise_for_status()

        full_text: List[str] = []
        for raw_line in r.iter_lines(decode_unicode=True):
            if not raw_line:  # keep-alives / blank lines
                continue
            if not raw_line.startswith("data:"):
                # Some proxies inject comments; ignore anything not starting with 'data:'
                continue

            data = raw_line[len("data:") :].strip()
            if data == "[DONE]":
                break

            try:
                obj = json.loads(data)
            except json.JSONDecodeError:
                _log.debug("Skipping non-JSON SSE chunk: %r", data[:200])
                continue

            # Tolerate unexpected chunk shapes rather than aborting the stream.
            try:
                delta = obj["choices"][0].get("delta") or {}
                piece = delta.get("content") or ""
            except (KeyError, IndexError) as e:
                _log.debug("Unexpected SSE chunk shape: %s", e)
                piece = ""

            if piece:
                full_text.append(piece)
                for stopper in stoppers:
                    # lookback_tokens() is a rough character budget here;
                    # clamp to at least 1 so the window is never empty.
                    lookback = max(1, stopper.lookback_tokens())
                    window = "".join(full_text)[-lookback:]
                    if stopper.should_stop(window):
                        return "".join(full_text)

        return "".join(full_text)