docling-project · dolfim-ibm · Sep 4, 2025 · Aug 7, 2025 · Aug 7, 2025 · Aug 12, 2025
diff --git a/.gitignore b/.gitignore
@@ -445,4 +445,7 @@ pip-selfcheck.json
 .action-lint
 .markdown-lint
 
-cookies.txt
+cookies.txt
+
+# Examples
+/examples/splitted_pdf/*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -7,12 +7,12 @@ repos:
       - id: ruff-format
         name: "Ruff formatter"
         args: [--config=pyproject.toml]
-        files: '^(docling_serve|tests).*\.(py|ipynb)$'
+        files: '^(docling_serve|tests|examples).*\.(py|ipynb)$'
       # Run the Ruff linter.
       - id: ruff
         name: "Ruff linter"
         args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
-        files: '^(docling_serve|tests).*\.(py|ipynb)$'
+        files: '^(docling_serve|tests|examples).*\.(py|ipynb)$'
   - repo: local
     hooks:
       - id: system

diff --git a/docs/examples.md b/docs/examples.md
@@ -0,0 +1,22 @@
+# Examples
+
+## Split processing
+
+The provided split-processing example demonstrates how to split a PDF into chunks of pages and send them for conversion. At the end, it concatenates the converted chunks into a single `JSON` document.
+
+At the beginning of the file there are variables to be used (and modified), such as:
+
+| Variable | Description |
+| ---------|-------------|
+| `path_to_pdf`| Path to PDF file to be split |
+| `pages_per_file`| Number of pages in each chunk the PDF is split into |
+| `base_url`| Base url of the `docling-serve` host |
+| `out_dir`| The output folder of each conversion `JSON` of split PDF and the final concatenated `JSON` |
+
+The example works as follows:
+
+- Get the number of pages of the `PDF`
+- Split the page count into chunks of `pages_per_file` pages and submit each chunk for conversion using the `page_range` parameter
+- Wait for all conversions to finish
+- Get all conversion results
+- Save each conversion `JSON` result into a `JSON` file
+- Concatenate all `JSONs` into a single `JSON` using `docling` concatenate method
+- Save concatenated `JSON` into a `JSON` file
diff --git a/examples/split_processing.py b/examples/split_processing.py
@@ -0,0 +1,124 @@
+import json
+import time
+from pathlib import Path
+
+import httpx
+from pydantic import BaseModel
+from pypdf import PdfReader
+
+from docling_core.types.doc.document import DoclingDocument
+
+# Variables to use: edit these to point the example at your own file and server.
+# PDF that is split into page ranges and converted.
+path_to_pdf = Path("./tests/2206.01062v1.pdf")
+# Maximum number of pages covered by each conversion task.
+pages_per_file = 4
+# Base URL of the docling-serve API that all requests are sent to.
+base_url = "http://localhost:5001/v1"
+# Folder receiving the per-chunk JSON files and the final concatenated JSON.
+out_dir = Path("examples/splitted_pdf/")
+
+
+class ConvertedSplittedPdf(BaseModel):
+    """Bookkeeping record for one page-range conversion task."""
+
+    # Identifier of the asynchronous conversion task returned on submission.
+    task_id: str
+    # Set to True once polling reports the task as successful.
+    conversion_finished: bool = False
+    # Raw result payload fetched for the task; None until it has been retrieved.
+    result: dict | None = None
+
+
+def get_task_result(task_id: str):
+    response = httpx.get(
+        f"{base_url}/result/{task_id}",
+        timeout=15,
+    )
+    return response.json()
+
+
+def check_task_status(task_id: str):
+    response = httpx.get(f"{base_url}/status/poll/{task_id}", timeout=15)
+    task = response.json()
+    task_status = task["task_status"]
+
+    task_finished = False
+    if task_status == "success":
+        task_finished = True
+
+    if task_status in ("failure", "revoked"):
+        raise RuntimeError("A conversion failed")
+
+    time.sleep(5)
+
+    return task_finished
+
+
+def post_file(file_path: Path, start_page: int, end_page: int):
+    payload = {
+        "to_formats": ["json"],
+        "image_export_mode": "placeholder",
+        "ocr": False,
+        "abort_on_error": False,
+        "page_range": [start_page, end_page],
+    }
+
+    files = {
+        "files": (file_path.name, file_path.open("rb"), "application/pdf"),
+    }
+    response = httpx.post(
+        f"{base_url}/convert/file/async",
+        files=files,
+        data=payload,
+        timeout=15,
+    )
+
+    task = response.json()
+
+    return task["task_id"]
+
+
+def main():
+    filename = path_to_pdf
+
+    splitted_pdfs: list[ConvertedSplittedPdf] = []
+
+    with open(filename, "rb") as input_pdf_file:
+        pdf_reader = PdfReader(input_pdf_file)
+        total_pages = len(pdf_reader.pages)
+
+        for start_page in range(0, total_pages, pages_per_file):
+            task_id = post_file(
+                filename, start_page + 1, min(start_page + pages_per_file, total_pages)
+            )
+            splitted_pdfs.append(ConvertedSplittedPdf(task_id=task_id))
+
+    all_files_converted = False
+    while not all_files_converted:
+        found_conversion_running = False
+        for splitted_pdf in splitted_pdfs:
+            if not splitted_pdf.conversion_finished:
+                found_conversion_running = True
+                print("checking conversion status...")
+                splitted_pdf.conversion_finished = check_task_status(
+                    splitted_pdf.task_id
+                )
+        if not found_conversion_running:
+            all_files_converted = True
+
+    for splitted_pdf in splitted_pdfs:
+        splitted_pdf.result = get_task_result(splitted_pdf.task_id)
+
+    files = []
+    for i, splitted_pdf in enumerate(splitted_pdfs):
+        json_content = json.dumps(
+            splitted_pdf.result.get("document").get("json_content"), indent=2
+        )
+        doc = DoclingDocument.model_validate_json(json_content)
+        filename = f"{out_dir}/splited_json_{i}.json"
+        doc.save_as_json(filename=filename)
+        files.append(filename)
+
+    docs = [DoclingDocument.load_from_json(filename=f) for f in files]
+    concate_doc = DoclingDocument.concatenate(docs=docs)
+
+    exp_json_file = Path(f"{out_dir}/concatenated.json")
+    concate_doc.save_as_json(exp_json_file)
+
+    print("Finished")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,7 +34,7 @@ classifiers = [
 requires-python = ">=3.10"
 dependencies = [
     "docling~=2.38",
-    "docling-core>=2.44.1",
+    "docling-core>=2.45.0",
     "docling-jobkit[kfp,rq,vlm]>=1.4.0,<2.0.0",
     "fastapi[standard]~=0.115",
     "httpx~=0.28",
@@ -69,6 +69,7 @@ dev = [
     "asgi-lifespan~=2.0",
     "mypy~=1.11",
     "pre-commit-uv~=4.1",
+    "pypdf>=6.0.0",
     "pytest~=8.3",
     "pytest-asyncio~=0.24",
     "pytest-check~=2.4",