Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -445,4 +445,7 @@ pip-selfcheck.json
.action-lint
.markdown-lint

cookies.txt
cookies.txt

# Examples
/examples/splitted_pdf/*
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ repos:
- id: ruff-format
name: "Ruff formatter"
args: [--config=pyproject.toml]
files: '^(docling_serve|tests).*\.(py|ipynb)$'
files: '^(docling_serve|tests|examples).*\.(py|ipynb)$'
# Run the Ruff linter.
- id: ruff
name: "Ruff linter"
args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
files: '^(docling_serve|tests).*\.(py|ipynb)$'
files: '^(docling_serve|tests|examples).*\.(py|ipynb)$'
- repo: local
hooks:
- id: system
Expand Down
22 changes: 22 additions & 0 deletions docs/examples.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Examples

## Split processing

The example of provided of split processing demonstrates how to split a PDF into chunks of pages and send them for conversion. At the end, it concatenates all split pages into a single conversion `JSON`.

At beginning of file there's variables to be used (and modified) such as:
| Variable | Description |
| ---------|-------------|
| `path_to_pdf`| Path to PDF file to be split |
| `pages_per_file`| The number of pages per chunk to split PDF |
| `base_url`| Base url of the `docling-serve` host |
| `out_dir`| The output folder of each conversion `JSON` of split PDF and the final concatenated `JSON` |

The example follows the following logic:
- Get the number of pages of the `PDF`
- Based on the number of chunks of pages, send each chunk to conversion using `page_range` parameter
- Wait all conversions to finish
- Get all conversion results
- Save each conversion `JSON` result into a `JSON` file
- Concatenate all `JSONs` into a single `JSON` using `docling` concatenate method
- Save concatenated `JSON` into a `JSON` file
124 changes: 124 additions & 0 deletions examples/split_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import json
import time
from pathlib import Path

import httpx
from pydantic import BaseModel
from pypdf import PdfReader

from docling_core.types.doc.document import DoclingDocument

# Variables to use
path_to_pdf = Path("./tests/2206.01062v1.pdf")
pages_per_file = 4
base_url = "http://localhost:5001/v1"
out_dir = Path("examples/splitted_pdf/")


class ConvertedSplittedPdf(BaseModel):
task_id: str
conversion_finished: bool = False
result: dict | None = None


def get_task_result(task_id: str):
response = httpx.get(
f"{base_url}/result/{task_id}",
timeout=15,
)
return response.json()


def check_task_status(task_id: str):
response = httpx.get(f"{base_url}/status/poll/{task_id}", timeout=15)
task = response.json()
task_status = task["task_status"]

task_finished = False
if task_status == "success":
task_finished = True

if task_status in ("failure", "revoked"):
raise RuntimeError("A conversion failed")

time.sleep(5)

return task_finished


def post_file(file_path: Path, start_page: int, end_page: int):
payload = {
"to_formats": ["json"],
"image_export_mode": "placeholder",
"ocr": False,
"abort_on_error": False,
"page_range": [start_page, end_page],
}

files = {
"files": (file_path.name, file_path.open("rb"), "application/pdf"),
}
response = httpx.post(
f"{base_url}/convert/file/async",
files=files,
data=payload,
timeout=15,
)

task = response.json()

return task["task_id"]


def main():
filename = path_to_pdf

splitted_pdfs: list[ConvertedSplittedPdf] = []

with open(filename, "rb") as input_pdf_file:
pdf_reader = PdfReader(input_pdf_file)
total_pages = len(pdf_reader.pages)

for start_page in range(0, total_pages, pages_per_file):
task_id = post_file(
filename, start_page + 1, min(start_page + pages_per_file, total_pages)
)
splitted_pdfs.append(ConvertedSplittedPdf(task_id=task_id))

all_files_converted = False
while not all_files_converted:
found_conversion_running = False
for splitted_pdf in splitted_pdfs:
if not splitted_pdf.conversion_finished:
found_conversion_running = True
print("checking conversion status...")
splitted_pdf.conversion_finished = check_task_status(
splitted_pdf.task_id
)
if not found_conversion_running:
all_files_converted = True

for splitted_pdf in splitted_pdfs:
splitted_pdf.result = get_task_result(splitted_pdf.task_id)

files = []
for i, splitted_pdf in enumerate(splitted_pdfs):
json_content = json.dumps(
splitted_pdf.result.get("document").get("json_content"), indent=2
)
doc = DoclingDocument.model_validate_json(json_content)
filename = f"{out_dir}/splited_json_{i}.json"
doc.save_as_json(filename=filename)
files.append(filename)

docs = [DoclingDocument.load_from_json(filename=f) for f in files]
concate_doc = DoclingDocument.concatenate(docs=docs)

exp_json_file = Path(f"{out_dir}/concatenated.json")
concate_doc.save_as_json(exp_json_file)

print("Finished")


if __name__ == "__main__":
main()
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ classifiers = [
requires-python = ">=3.10"
dependencies = [
"docling~=2.38",
"docling-core>=2.44.1",
"docling-core>=2.45.0",
"docling-jobkit[kfp,rq,vlm]>=1.4.0,<2.0.0",
"fastapi[standard]~=0.115",
"httpx~=0.28",
Expand Down Expand Up @@ -69,6 +69,7 @@ dev = [
"asgi-lifespan~=2.0",
"mypy~=1.11",
"pre-commit-uv~=4.1",
"pypdf>=6.0.0",
"pytest~=8.3",
"pytest-asyncio~=0.24",
"pytest-check~=2.4",
Expand Down
Loading
Loading