Skip to content

Commit cb09f07

Browse files
committed
docs(plugins): revamp external plugin example and usage guide
Rewrites the example plugin section to streamline setup:
- Provides clear install and run steps (`pip install -e` and the `main.py` driver)
- Clarifies how to enable external plugins (CLI and programmatic)
- Updates file names/paths and entry-point guidance; fixes/updates links (e.g., the options filename and quick links to the example package in the repo)

Signed-off-by: FrigaZzz <[email protected]>
1 parent 99a88aa commit cb09f07

File tree

8 files changed

+592
-705
lines changed

8 files changed

+592
-705
lines changed

docs/concepts/plugins.md

Lines changed: 0 additions & 516 deletions
Large diffs are not rendered by default.

docs/examples/plugin_tutorial.md

Lines changed: 531 additions & 0 deletions
Large diffs are not rendered by default.

docs/examples/third_party_plugins/api_usage/datamodel/pipeline_options/picture_description_api_model_with_usage.py

Lines changed: 0 additions & 35 deletions
This file was deleted.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
2+
3+
from pydantic import (
4+
AnyUrl,
5+
Field,
6+
)
7+
8+
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
9+
10+
11+
class PictureDescriptionApiOptionsWithUsage(PictureDescriptionApiOptions):
12+
"""DescriptionAnnotation."""
13+
14+
kind: ClassVar[Literal["api_usage"]] = "api_usage"

docs/examples/third_party_plugins/api_usage/datamodel/utils/api_image_request_with_usage.py

Lines changed: 21 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,20 @@
88
from PIL import Image
99
from pydantic import AnyUrl
1010

11-
from docling.datamodel.base_models import OpenAiApiResponse
11+
from docling.datamodel.base_models import OpenAiApiResponse, OpenAiResponseUsage
1212
from docling.models.utils.generation_utils import GenerationStopper
1313

1414
_log = logging.getLogger(__name__)
1515

1616

17-
def api_image_request(
17+
def api_image_request_with_usage(
1818
image: Image.Image,
1919
prompt: str,
2020
url: AnyUrl,
2121
timeout: float = 20,
2222
headers: Optional[Dict[str, str]] = None,
23-
token_extract_key: Optional[str] = None,
2423
**params,
25-
) -> Tuple[str, Optional[dict]]:
24+
) -> Tuple[str, Optional[OpenAiResponseUsage]]:
2625
"""Send an image+prompt to an OpenAI-compatible API and return (text, usage).
2726
2827
If no usage data is available, the second tuple element will be None.
@@ -38,138 +37,34 @@ def api_image_request(
3837
"type": "image_url",
3938
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
4039
},
41-
{"type": "text", "text": prompt},
40+
{
41+
"type": "text",
42+
"text": prompt,
43+
},
4244
],
4345
}
4446
]
4547

46-
payload = {"messages": messages, **params}
48+
payload = {
49+
"messages": messages,
50+
**params,
51+
}
52+
4753
headers = headers or {}
4854

49-
r = requests.post(str(url), headers=headers, json=payload, timeout=timeout)
55+
r = requests.post(
56+
str(url),
57+
headers=headers,
58+
json=payload,
59+
timeout=timeout,
60+
)
5061
if not r.ok:
5162
_log.error(f"Error calling the API. Response was {r.text}")
5263
r.raise_for_status()
5364

54-
# Try to parse JSON body
55-
try:
56-
resp_json = r.json()
57-
except Exception:
58-
api_resp = OpenAiApiResponse.model_validate_json(r.text)
59-
generated_text = api_resp.choices[0].message.content.strip()
60-
return generated_text, None
61-
62-
usage = None
63-
if isinstance(resp_json, dict):
64-
usage = resp_json.get("usage")
65-
66-
# Extract generated text using common OpenAI shapes
67-
generated_text = ""
68-
try:
69-
generated_text = resp_json["choices"][0]["message"]["content"].strip()
70-
except Exception:
71-
try:
72-
generated_text = resp_json["choices"][0].get("text", "")
73-
if isinstance(generated_text, str):
74-
generated_text = generated_text.strip()
75-
except Exception:
76-
try:
77-
api_resp = OpenAiApiResponse.model_validate_json(r.text)
78-
generated_text = api_resp.choices[0].message.content.strip()
79-
except Exception:
80-
generated_text = ""
65+
api_resp = OpenAiApiResponse.model_validate_json(r.text)
66+
generated_text = api_resp.choices[0].message.content.strip()
8167

82-
# If an explicit token_extract_key is provided and found in usage, use it
83-
if token_extract_key and isinstance(usage, dict) and token_extract_key in usage:
84-
extracted = usage.get(token_extract_key)
85-
generated_text = (
86-
str(extracted).strip() if extracted is not None else generated_text
87-
)
68+
usage = api_resp.usage if hasattr(api_resp, "usage") else None
8869

8970
return generated_text, usage
90-
91-
92-
def api_image_request_streaming(
93-
image: Image.Image,
94-
prompt: str,
95-
url: AnyUrl,
96-
*,
97-
timeout: float = 20,
98-
headers: Optional[Dict[str, str]] = None,
99-
generation_stoppers: List[GenerationStopper] = [],
100-
**params,
101-
) -> str:
102-
"""
103-
Stream a chat completion from an OpenAI-compatible server (e.g., vLLM).
104-
Parses SSE lines: 'data: {json}\n\n', terminated by 'data: [DONE]'.
105-
Accumulates text and calls stopper.should_stop(window) as chunks arrive.
106-
If stopper triggers, the HTTP connection is closed to abort server-side generation.
107-
"""
108-
img_io = BytesIO()
109-
image.save(img_io, "PNG")
110-
image_b64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
111-
112-
messages = [
113-
{
114-
"role": "user",
115-
"content": [
116-
{
117-
"type": "image_url",
118-
"image_url": {"url": f"data:image/png;base64,{image_b64}"},
119-
},
120-
{"type": "text", "text": prompt},
121-
],
122-
}
123-
]
124-
125-
payload = {"messages": messages, "stream": True, **params}
126-
_log.debug(f"API streaming request payload: {json.dumps(payload, indent=2)}")
127-
128-
hdrs = {"Accept": "text/event-stream", **(headers or {})}
129-
if "temperature" in params:
130-
hdrs["X-Temperature"] = str(params["temperature"])
131-
132-
# Stream the HTTP response
133-
with requests.post(
134-
str(url), headers=hdrs, json=payload, timeout=timeout, stream=True
135-
) as r:
136-
if not r.ok:
137-
_log.error(
138-
f"Error calling the API {url} in streaming mode. Response was {r.text}"
139-
)
140-
r.raise_for_status()
141-
142-
full_text: List[str] = []
143-
for raw_line in r.iter_lines(decode_unicode=True):
144-
if not raw_line: # keep-alives / blank lines
145-
continue
146-
if not raw_line.startswith("data:"):
147-
# Some proxies inject comments; ignore anything not starting with 'data:'
148-
continue
149-
150-
data = raw_line[len("data:") :].strip()
151-
if data == "[DONE]":
152-
break
153-
154-
try:
155-
obj = json.loads(data)
156-
except json.JSONDecodeError:
157-
_log.debug("Skipping non-JSON SSE chunk: %r", data[:200])
158-
continue
159-
160-
try:
161-
delta = obj["choices"][0].get("delta") or {}
162-
piece = delta.get("content") or ""
163-
except (KeyError, IndexError) as e:
164-
_log.debug("Unexpected SSE chunk shape: %s", e)
165-
piece = ""
166-
167-
if piece:
168-
full_text.append(piece)
169-
for stopper in generation_stoppers:
170-
lookback = max(1, stopper.lookback_tokens())
171-
window = "".join(full_text)[-lookback:]
172-
if stopper.should_stop(window):
173-
return "".join(full_text)
174-
175-
return "".join(full_text)

docs/examples/third_party_plugins/api_usage/models/picture_description_api_model.py

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,35 @@
11
from collections.abc import Iterable
22
from concurrent.futures import ThreadPoolExecutor
33
from pathlib import Path
4-
from typing import List, Literal, Optional, Type, Union
4+
from typing import List, Optional, Type, Union
55

6-
from api_usage.datamodel.pipeline_options.picture_description_api_model_with_usage import (
7-
PictureDescriptionApiOptionsWithUsage,
6+
from api_usage.datamodel.utils.api_image_request_with_usage import (
7+
api_image_request_with_usage,
88
)
9-
from api_usage.datamodel.utils.api_image_request_with_usage import api_image_request
109
from docling_core.types.doc import DoclingDocument, NodeItem, PictureItem
1110
from docling_core.types.doc.document import (
12-
BaseAnnotation,
11+
DescriptionAnnotation,
1312
) # TODO: move import to docling_core.types.doc
1413
from PIL import Image
1514

1615
from docling.datamodel.accelerator_options import AcceleratorOptions
16+
from docling.datamodel.base_models import OpenAiResponseUsage
1717
from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
1818
from docling.exceptions import OperationNotAllowed
1919
from docling.models.base_model import ItemAndImageEnrichmentElement
20-
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
20+
from docling.models.picture_description_api_model import PictureDescriptionApiModel
21+
from docs.examples.third_party_plugins.api_usage.datamodel.pipeline_options.picture_description_api_options_with_usage import (
22+
PictureDescriptionApiOptionsWithUsage,
23+
)
2124

2225

23-
class DescriptionAnnotationWithUsage(BaseAnnotation):
26+
class DescriptionAnnotationWithUsage(DescriptionAnnotation):
2427
"""DescriptionAnnotation."""
2528

26-
kind: Literal["description"] = "description"
27-
text: str
28-
provenance: str
29-
token_usage: Optional[dict] = None
29+
usage: Optional[OpenAiResponseUsage] = None
3030

3131

32-
class PictureDescriptionApiModelWithUsage(PictureDescriptionBaseModel):
32+
class PictureDescriptionApiModelWithUsage(PictureDescriptionApiModel):
3333
# elements_batch_size = 4
3434

3535
@classmethod
@@ -65,14 +65,12 @@ def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
6565
# Note: technically we could make a batch request here,
6666
# but not all APIs will allow for it. For example, vllm won't allow more than 1.
6767
def _api_request(image):
68-
# Pass token_extract_key so api_image_request can return token usage
69-
return api_image_request(
68+
return api_image_request_with_usage(
7069
image=image,
7170
prompt=self.options.prompt,
7271
url=self.options.url,
7372
timeout=self.options.timeout,
7473
headers=self.options.headers,
75-
token_extract_key=getattr(self.options, "token_extract_key", None),
7674
**self.options.params,
7775
)
7876

@@ -120,7 +118,7 @@ def __call__(
120118

121119
item.annotations.append(
122120
DescriptionAnnotationWithUsage(
123-
text=text, provenance=self.provenance, token_usage=usage
121+
text=text, provenance=self.provenance, usage=usage
124122
)
125123
)
126124
yield item

docs/examples/third_party_plugins/main.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,17 @@
1717
import os
1818
from typing import Dict
1919

20-
# Import the options class from the installed example plugin package
21-
from api_usage.datamodel.pipeline_options.picture_description_api_model_with_usage import (
22-
PictureDescriptionApiOptionsWithUsage,
23-
)
20+
from dotenv import load_dotenv
2421

22+
# Import the options class from the installed example plugin package
2523
from docling.datamodel.base_models import InputFormat
2624
from docling.datamodel.pipeline_options import PdfPipelineOptions
2725
from docling.document_converter import DocumentConverter, PdfFormatOption
26+
from docs.examples.third_party_plugins.api_usage.datamodel.pipeline_options.picture_description_api_options_with_usage import (
27+
PictureDescriptionApiOptionsWithUsage,
28+
)
29+
30+
load_dotenv()
2831

2932

3033
def main():
@@ -53,13 +56,10 @@ def main():
5356
PictureDescriptionApiOptionsWithUsage(
5457
url=url,
5558
headers=headers,
56-
params={"model": "gpt-4o-mini", "temperature": 0},
59+
params={"model": "gpt-5-mini", "temperature": 1},
5760
prompt="Describe the image clearly and concisely in a few sentences.",
5861
timeout=45.0,
5962
concurrency=2,
60-
# If your server returns token usage in a dict under 'usage', you can
61-
# extract a specific field and make it the generated text:
62-
token_extract_key="usage",
6363
)
6464
)
6565

@@ -88,11 +88,9 @@ def main():
8888
continue
8989

9090
for ann_idx, ann in enumerate(pic.annotations):
91-
token_usage = getattr(ann, "token_usage", None)
91+
usage = getattr(ann, "usage", None)
9292
ann_text = getattr(ann, "text", None)
93-
print(
94-
f" Annotation {ann_idx}: text={ann_text!r} token_usage={token_usage!r}"
95-
)
93+
print(f" Annotation {ann_idx}: text={ann_text!r} usage={usage!r}")
9694

9795

9896
if __name__ == "__main__":

mkdocs.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@ nav:
111111
- 🖼️ Picture annotation:
112112
- "Annotate picture with local VLM": examples/pictures_description.ipynb
113113
- "Annotate picture with remote VLM": examples/pictures_description_api.py
114+
- 🧩 Plugins:
115+
- "Third‑party plugin tutorial": examples/plugin_tutorial.md
114116
- ✨ Enrichment development:
115117
- "Figure enrichment": examples/develop_picture_enrichment.py
116118
- "Formula enrichment": examples/develop_formula_understanding.py

0 commit comments

Comments
 (0)