Skip to content

Commit 065ae9c

Browse files
committed
feat: add example plugin for API-backed picture description with token usage
Signed-off-by: FrigaZzz <[email protected]>
1 parent 0610d01 commit 065ae9c

File tree

8 files changed

+983
-2
lines changed

8 files changed

+983
-2
lines changed

docs/concepts/plugins.md

Lines changed: 516 additions & 0 deletions
Large diffs are not rendered by default.

docs/examples/rag_mongodb.ipynb

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -452,8 +452,6 @@
452452
"source": [
453453
"## Part 4: Perform RAG on parsed articles\n",
454454
"\n",
455-
"Weaviate's `generate` module allows you to perform RAG over your embedded data without having to use a separate framework.\n",
456-
"\n",
457455
"We specify a prompt that includes the field we want to search through in the database (in this case it's `text`), a query that includes our search term, and the number of retrieved results to use in the generation."
458456
]
459457
},
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from api_usage.models.picture_description_api_model import (
2+
PictureDescriptionApiModelWithUsage,
3+
)
4+
5+
6+
def picture_description():
    """Docling plugin entry point for picture-description models.

    Returns a registry dict that maps the "picture_description" extension
    point to the model classes contributed by this plugin.
    """
    registry = {"picture_description": [PictureDescriptionApiModelWithUsage]}
    return registry
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
2+
3+
from pydantic import (
4+
AnyUrl,
5+
BaseModel,
6+
ConfigDict,
7+
Field,
8+
)
9+
10+
from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
11+
12+
13+
class PictureDescriptionApiOptionsWithUsage(PictureDescriptionBaseOptions):
    """Options for the API-backed picture-description model with token usage.

    Configures the OpenAI-compatible chat-completions endpoint, per-request
    parameters, and an optional key used to extract a value from the
    response's ``usage`` dict.
    """

    # Discriminator used by docling to match these options to the model class.
    kind: ClassVar[Literal["api_usage"]] = "api_usage"

    # OpenAI-compatible chat-completions endpoint.
    url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
    # Extra HTTP headers (e.g. Authorization). Pydantic copies mutable
    # class-level defaults per instance, so the {} default is safe here.
    headers: Dict[str, str] = {}
    # Additional JSON payload fields merged into each request (e.g. model name).
    params: Dict[str, Any] = {}
    # Request timeout in seconds.
    timeout: float = 20
    # Maximum number of concurrent API requests.
    concurrency: int = 1

    # Prompt sent alongside each image.
    prompt: str = "Describe this image in a few sentences."
    provenance: str = ""
    # Key inside the response 'usage' (or similar) which will be used to extract
    # the token/response text. Example: 'content' or 'text'. If None, no
    # token extraction will be performed by default.
    token_extract_key: Optional[str] = Field(
        None,
        description=(
            "Key in the response usage dict whose value contains the token/"
            "response to extract. For example 'content' or 'text'."
        ),
    )
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
import base64
2+
import json
3+
import logging
4+
from io import BytesIO
5+
from typing import Dict, List, Optional, Tuple
6+
7+
import requests
8+
from PIL import Image
9+
from pydantic import AnyUrl
10+
11+
from docling.datamodel.base_models import OpenAiApiResponse
12+
from docling.models.utils.generation_utils import GenerationStopper
13+
14+
_log = logging.getLogger(__name__)
15+
16+
17+
def _extract_generated_text(resp_json, raw_body: str) -> str:
    """Best-effort extraction of generated text from common OpenAI response shapes.

    Tries, in order: the chat shape (choices[0].message.content), the legacy
    completion shape (choices[0].text), and finally pydantic validation of the
    raw body. Returns "" when nothing matches.
    """
    try:
        return resp_json["choices"][0]["message"]["content"].strip()
    except (KeyError, IndexError, TypeError, AttributeError):
        pass
    try:
        text = resp_json["choices"][0].get("text", "")
        return text.strip() if isinstance(text, str) else text
    except (KeyError, IndexError, TypeError, AttributeError):
        pass
    try:
        api_resp = OpenAiApiResponse.model_validate_json(raw_body)
        return api_resp.choices[0].message.content.strip()
    except Exception:
        # Last-resort fallback: the body matched no known shape.
        return ""


def api_image_request(
    image: Image.Image,
    prompt: str,
    url: AnyUrl,
    timeout: float = 20,
    headers: Optional[Dict[str, str]] = None,
    token_extract_key: Optional[str] = None,
    **params,
) -> Tuple[str, Optional[dict]]:
    """Send an image+prompt to an OpenAI-compatible API and return (text, usage).

    Args:
        image: Image to describe; sent as a base64-encoded PNG data URL.
        prompt: Text prompt sent alongside the image.
        url: OpenAI-compatible chat-completions endpoint.
        timeout: Request timeout in seconds.
        headers: Optional extra HTTP headers (e.g. Authorization).
        token_extract_key: If set and present in the response 'usage' dict,
            its (stringified) value replaces the generated text.
        **params: Extra payload fields merged into the request JSON.

    Returns:
        Tuple of (generated text, usage dict). The usage element is None
        when the response carries no usage data.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
    """
    img_io = BytesIO()
    image.save(img_io, "PNG")
    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    payload = {"messages": messages, **params}

    r = requests.post(
        str(url), headers=headers or {}, json=payload, timeout=timeout
    )
    if not r.ok:
        _log.error(f"Error calling the API. Response was {r.text}")
    r.raise_for_status()

    try:
        resp_json = r.json()
    except ValueError:
        # Body is not JSON at all; let pydantic parse it (raises if invalid).
        # No usage information is available in this case.
        api_resp = OpenAiApiResponse.model_validate_json(r.text)
        return api_resp.choices[0].message.content.strip(), None

    usage = resp_json.get("usage") if isinstance(resp_json, dict) else None
    generated_text = _extract_generated_text(resp_json, r.text)

    # If an explicit token_extract_key is provided and found in usage, its
    # value overrides the extracted text.
    if token_extract_key and isinstance(usage, dict) and token_extract_key in usage:
        extracted = usage.get(token_extract_key)
        if extracted is not None:
            generated_text = str(extracted).strip()

    return generated_text, usage
90+
91+
92+
def api_image_request_streaming(
    image: Image.Image,
    prompt: str,
    url: AnyUrl,
    *,
    timeout: float = 20,
    headers: Optional[Dict[str, str]] = None,
    generation_stoppers: Optional[List[GenerationStopper]] = None,
    **params,
) -> str:
    """
    Stream a chat completion from an OpenAI-compatible server (e.g., vLLM).
    Parses SSE lines: 'data: {json}\n\n', terminated by 'data: [DONE]'.
    Accumulates text and calls stopper.should_stop(window) as chunks arrive.
    If stopper triggers, the HTTP connection is closed to abort server-side generation.

    Args:
        image: Image to describe; sent as a base64-encoded PNG data URL.
        prompt: Text prompt sent alongside the image.
        url: OpenAI-compatible chat-completions endpoint.
        timeout: Request timeout in seconds.
        headers: Optional extra HTTP headers (e.g. Authorization).
        generation_stoppers: Stoppers consulted after each chunk; defaults to
            none. (Default is None, not [], to avoid a shared mutable default.)
        **params: Extra payload fields merged into the request JSON.

    Returns:
        The accumulated generated text (possibly truncated by a stopper).

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
    """
    stoppers = generation_stoppers or []

    img_io = BytesIO()
    image.save(img_io, "PNG")
    image_b64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    payload = {"messages": messages, "stream": True, **params}
    _log.debug(f"API streaming request payload: {json.dumps(payload, indent=2)}")

    hdrs = {"Accept": "text/event-stream", **(headers or {})}
    # NOTE(review): mirroring temperature into a custom header — presumably
    # for a proxy/server that reads it; verify this is still needed.
    if "temperature" in params:
        hdrs["X-Temperature"] = str(params["temperature"])

    # Stream the HTTP response; the context manager closes the connection,
    # which also aborts server-side generation on early return.
    with requests.post(
        str(url), headers=hdrs, json=payload, timeout=timeout, stream=True
    ) as r:
        if not r.ok:
            _log.error(
                f"Error calling the API {url} in streaming mode. Response was {r.text}"
            )
        r.raise_for_status()

        full_text: List[str] = []
        for raw_line in r.iter_lines(decode_unicode=True):
            if not raw_line:  # keep-alives / blank lines
                continue
            if not raw_line.startswith("data:"):
                # Some proxies inject comments; ignore anything not starting with 'data:'
                continue

            data = raw_line[len("data:") :].strip()
            if data == "[DONE]":
                break

            try:
                obj = json.loads(data)
            except json.JSONDecodeError:
                _log.debug("Skipping non-JSON SSE chunk: %r", data[:200])
                continue

            try:
                delta = obj["choices"][0].get("delta") or {}
                piece = delta.get("content") or ""
            except (KeyError, IndexError) as e:
                _log.debug("Unexpected SSE chunk shape: %s", e)
                piece = ""

            if piece:
                full_text.append(piece)
                for stopper in stoppers:
                    # Only the stopper's lookback window is scanned, not the
                    # whole accumulated text.
                    lookback = max(1, stopper.lookback_tokens())
                    window = "".join(full_text)[-lookback:]
                    if stopper.should_stop(window):
                        return "".join(full_text)

    return "".join(full_text)
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
from collections.abc import Iterable
2+
from concurrent.futures import ThreadPoolExecutor
3+
from pathlib import Path
4+
from typing import List, Literal, Optional, Type, Union
5+
6+
from api_usage.datamodel.pipeline_options.picture_description_api_model_with_usage import (
7+
PictureDescriptionApiOptionsWithUsage,
8+
)
9+
from api_usage.datamodel.utils.api_image_request_with_usage import api_image_request
10+
from docling_core.types.doc import DoclingDocument, NodeItem, PictureItem
11+
from docling_core.types.doc.document import (
12+
BaseAnnotation,
13+
) # TODO: move import to docling_core.types.doc
14+
from PIL import Image
15+
16+
from docling.datamodel.accelerator_options import AcceleratorOptions
17+
from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
18+
from docling.exceptions import OperationNotAllowed
19+
from docling.models.base_model import ItemAndImageEnrichmentElement
20+
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
21+
22+
23+
class DescriptionAnnotationWithUsage(BaseAnnotation):
    """Picture description annotation that also carries API token usage.

    Same shape as docling's description annotation, extended with an
    optional ``token_usage`` dict taken from the API response.
    """

    kind: Literal["description"] = "description"
    text: str
    provenance: str
    # Raw 'usage' dict from the API response (e.g. token counts); None when
    # the response carried no usage data.
    token_usage: Optional[dict] = None
30+
31+
32+
class PictureDescriptionApiModelWithUsage(PictureDescriptionBaseModel):
    """Picture-description model calling an OpenAI-compatible API, keeping token usage.

    Annotates pictures via `api_image_request` and attaches a
    `DescriptionAnnotationWithUsage` (text + provenance + usage dict) to each
    described `PictureItem`.
    """

    @classmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        """Return the options class this model is configured with."""
        return PictureDescriptionApiOptionsWithUsage

    def __init__(
        self,
        enabled: bool,
        enable_remote_services: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: PictureDescriptionApiOptionsWithUsage,
        accelerator_options: AcceleratorOptions,
    ):
        """Initialize the model.

        Raises:
            OperationNotAllowed: if `enabled` is True but remote services
                were not explicitly allowed via `enable_remote_services`.
        """
        super().__init__(
            enabled=enabled,
            enable_remote_services=enable_remote_services,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        # Narrow the attribute type for type checkers; value set by super().
        self.options: PictureDescriptionApiOptionsWithUsage
        self.concurrency = self.options.concurrency

        if self.enabled:
            if not enable_remote_services:
                raise OperationNotAllowed(
                    "Connections to remote services is only allowed when set explicitly. "
                    "pipeline_options.enable_remote_services=True."
                )

    def _annotate_images(
        self, images: Iterable[Image.Image]
    ) -> Iterable[tuple[str, Optional[dict]]]:
        """Yield (text, usage) for each image, querying the API concurrently.

        Note: technically we could make a batch request here, but not all
        APIs will allow for it. For example, vllm won't allow more than 1.
        """

        def _api_request(image):
            # Pass token_extract_key so api_image_request can return token usage.
            return api_image_request(
                image=image,
                prompt=self.options.prompt,
                url=self.options.url,
                timeout=self.options.timeout,
                headers=self.options.headers,
                token_extract_key=getattr(self.options, "token_extract_key", None),
                **self.options.params,
            )

        with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
            yield from executor.map(_api_request, images)

    def __call__(
        self,
        doc: DoclingDocument,
        element_batch: Iterable[ItemAndImageEnrichmentElement],
    ) -> Iterable[NodeItem]:
        """Enrich each picture element with a description annotation.

        Pictures whose page-area fraction is below
        `options.picture_area_threshold` are passed through undescribed.
        """
        if not self.enabled:
            for element in element_batch:
                yield element.item
            return

        images: List[Image.Image] = []
        elements: List[PictureItem] = []
        for el in element_batch:
            assert isinstance(el.item, PictureItem)
            describe_image = True
            # Don't describe the image if it's smaller than the threshold
            if len(el.item.prov) > 0:
                prov = el.item.prov[0]  # PictureItems have at most a single provenance
                page = doc.pages.get(prov.page_no)
                if page is not None:
                    page_area = page.size.width * page.size.height
                    if page_area > 0:
                        area_fraction = prov.bbox.area() / page_area
                        if area_fraction < self.options.picture_area_threshold:
                            describe_image = False
            if describe_image:
                elements.append(el.item)
                images.append(el.image)

        outputs = self._annotate_images(images)

        for item, output in zip(elements, outputs):
            # api_image_request now may return (text, usage) or plain text;
            # normalize to tuple
            if isinstance(output, tuple):
                text, usage = output
            else:
                text, usage = output, None

            item.annotations.append(
                DescriptionAnnotationWithUsage(
                    text=text, provenance=self.provenance, token_usage=usage
                )
            )
            yield item

0 commit comments

Comments
 (0)