Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ LOG_LEVEL=INFO

HOST_DATA_FOLDER=
CONTAINER_DATA_FOLDER="/data"
DATA_FOLDER=

http_proxy=
https_proxy=
Expand All @@ -16,6 +17,7 @@ MARKER_API_PORT=
SPARROW_API_PORT=8001 # hard-coded in sparrow API
DOCLING_API_PORT=
PADDLEOCR_API_PORT=
KREUZBERG_API_PORT=

# Source PostgreSQL instance
POSTGRES_SOURCE_USER=postgres
Expand Down
34 changes: 34 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,40 @@ services:
retries: 3
start_period: 30s

kreuzberg:
  # Opt-in service: only started when the `kreuzberg` profile is enabled.
  profiles: [kreuzberg]
  build:
    context: packages/ocr/kreuzberg
    dockerfile: Dockerfile
    args:
      <<: *build-args-common
      KREUZBERG_API_PORT: ${KREUZBERG_API_PORT}
  environment:
    <<: [*proxy-common, *common-env]
    # Path of the data folder as seen from inside the container (see the volume below).
    DATA_FOLDER: /data
    KREUZBERG_API_PORT: ${KREUZBERG_API_PORT}
  env_file:
    - ./.env
  ports:
    - "${KREUZBERG_API_PORT}:${KREUZBERG_API_PORT}"
  volumes:
    # DATA_FOLDER is interpolated from the host environment / .env and is relative to the repo root.
    - ${PWD}/${DATA_FOLDER}:/data
  networks:
    - pyonb_ocr_api
  healthcheck:
    # --fail makes curl exit non-zero on HTTP 4xx/5xx; without it the check
    # passes as long as the port answers, even if /health returns an error.
    test:
      [
        "CMD",
        "curl",
        "--fail",
        "-X",
        "GET",
        "http://localhost:${KREUZBERG_API_PORT}/health",
      ]
    interval: 10s
    timeout: 3s
    retries: 3
    start_period: 30s

ocr-forwarding-api:
build:
context: src/api
Expand Down
30 changes: 30 additions & 0 deletions packages/ocr/kreuzberg/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
FROM ghcr.io/astral-sh/uv:python3.13-bookworm AS app

# Strict shell for shell-form instructions: fail on pipe errors, on any
# failing command and on unset variables, and trace each command.
SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"]

WORKDIR /app
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# System packages (presumably runtime dependencies of kreuzberg's OCR and
# document-conversion backends -- confirm against the kreuzberg docs).
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libgomp1 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    pandoc \
    tesseract-ocr \
    tesseract-ocr-eng \
    && apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# With more than one source, the COPY destination must be a directory ending in "/".
COPY ./pyproject.toml ./README.md ./
COPY ./src src/

RUN uv venv
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked uv sync --no-editable --no-dev

# make uvicorn etc available
ENV PATH="/app/.venv/bin:$PATH"

# No --reload in the image: the source is baked in (nothing to watch), and
# uvicorn ignores --workers whenever --reload is enabled.
CMD uvicorn pyonb_kreuzberg.api:app --host 0.0.0.0 --port "$KREUZBERG_API_PORT" --workers 4 --use-colors
52 changes: 52 additions & 0 deletions packages/ocr/kreuzberg/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Instructions

Before using the `kreuzberg` API for OCR, you will need to set the `KREUZBERG_API_PORT`
environment variable in the top-level `.env` file.

## Python

First install the `kreuzberg` API. From the top-level `pyonb` directory:

```shell
uv sync --extra kreuzberg
```

Then start the `kreuzberg` API:

```shell
python packages/ocr/kreuzberg/src/pyonb_kreuzberg/api.py
```

You can then use `curl` to send a PDF to the API:

```shell
curl -v -X POST http://127.0.0.1:8116/extract \
-F "data=@document.pdf" \
-H "accept: application/json"
```

Note, this assumes you have set `KREUZBERG_API_PORT=8116`.

Currently, this returns the response from the
[`kreuzberg` API](https://kreuzberg.dev/user-guide/api-server/#extract-files)
directly, rather than the standard `pyonb` response.

## Docker Compose

You will need to define the `OCR_FORWARDING_API_PORT` in the `.env` file.

Then, spin up the `ocr-forwarding-api` and `kreuzberg` services:

```shell
docker-compose --profile kreuzberg up --build --detach
```

You can then use `curl` to send a PDF to the forwarding API:

```shell
curl -v -X POST http://127.0.0.1:8110/kreuzberg-ocr/inference_single \
-F "file_upload=@document.pdf" \
-H "accept: application/json"
```

Note, this assumes you have set `OCR_FORWARDING_API_PORT` to `8110`.
14 changes: 14 additions & 0 deletions packages/ocr/kreuzberg/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]

[project]
dependencies = [
"kreuzberg[api]==3.13.3",
"uvicorn",
]
description = "pyonb wrapper around kreuzberg"
name = "pyonb-kreuzberg"
readme = "README.md"
requires-python = ">=3.11"
version = "0.1.0"
1 change: 1 addition & 0 deletions packages/ocr/kreuzberg/src/pyonb_kreuzberg/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Package for converting PDFs to structured text using Kreuzberg OCR."""
40 changes: 40 additions & 0 deletions packages/ocr/kreuzberg/src/pyonb_kreuzberg/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""API for Kreuzberg OCR."""

import os

import uvicorn
from kreuzberg._api.main import (
KreuzbergError,
Litestar,
OpenTelemetryConfig,
OpenTelemetryPlugin,
StructLoggingConfig,
exception_handler,
general_exception_handler,
get_configuration,
handle_files_upload,
health_check,
)

# Port the API listens on when this module is run directly; read from the
# KREUZBERG_API_PORT environment variable, falling back to 8116 when unset.
KREUZBERG_API_PORT = int(os.getenv("KREUZBERG_API_PORT", default="8116"))

# Litestar application assembled from the route handlers, plugins and exception
# handlers imported from `kreuzberg._api.main`.
# NOTE(review): `kreuzberg._api` is a private module, so these imports may break
# on a kreuzberg upgrade -- keep the dependency version pinned.
app = Litestar(
route_handlers=[handle_files_upload, health_check, get_configuration],
# presumably a limit in bytes (~100 MB) -- confirm against the Litestar docs
request_max_body_size=100_000_000,
plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
logging_config=StructLoggingConfig(),
exception_handlers={
KreuzbergError: exception_handler,
Exception: general_exception_handler,
},
)

if __name__ == "__main__":
    # Local development entry point, bound to localhost only.
    #
    # uvicorn only supports `reload` (and `workers`) when the application is
    # given as an import string; passing the app object makes it log a warning
    # and exit. `workers` is also ignored while reloading, so it is not passed.
    uvicorn.run(
        "pyonb_kreuzberg.api:app",
        host="127.0.0.1",
        port=KREUZBERG_API_PORT,
        reload=True,
        use_colors=True,
    )
10 changes: 10 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ optional-dependencies = {dev = [
"mkdocs-material",
"mkdocstrings",
"mkdocstrings-python",
], kreuzberg = [
"pyonb-kreuzberg",
], test = [
"pytest",
"pytest-cov",
Expand Down Expand Up @@ -149,3 +151,11 @@ env.docs = {commands = [
gh.python."3.11" = ["py311"]
gh.python."3.12" = ["py312"]
gh.python."3.13" = ["py313"]

[tool.uv.sources]
pyonb-kreuzberg = {workspace = true}

[tool.uv.workspace]
members = [
"packages/ocr/*",
]
6 changes: 4 additions & 2 deletions src/api/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
from fastapi import FastAPI, status
from fastapi.responses import JSONResponse, RedirectResponse

from .routers import docling, marker, paddleocr, sparrow
from .routers import docling, kreuzberg, marker, paddleocr, sparrow

_today = datetime.datetime.now(datetime.UTC).strftime("%Y_%m_%d") # type: ignore[attr-defined] # mypy complains that 'Module has no attribute "UTC"'
logging.basicConfig(
filename="pyonb-" + datetime.datetime.now(datetime.UTC).strftime("%Y_%m_%d") + ".log",
filename=f"pyonb-{_today}.log",
format="%(asctime)s %(message)s",
filemode="a",
)
Expand All @@ -23,6 +24,7 @@
app.include_router(marker.router)
app.include_router(paddleocr.router)
app.include_router(docling.router)
app.include_router(kreuzberg.router)


@app.get("/", include_in_schema=False)
Expand Down
84 changes: 84 additions & 0 deletions src/api/app/routers/kreuzberg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Routers for Kreuzberg OCR."""

import logging
import os
import time
from typing import Annotated, Any

import aiohttp
from fastapi import APIRouter, File, UploadFile, status
from fastapi.responses import JSONResponse

# `getLogger()` with no name returns the root logger.
logger = logging.getLogger()

# Router collecting the Kreuzberg endpoints defined in this module.
router = APIRouter()

# Port of the kreuzberg service; `os.getenv` returns None when the variable is
# unset, in which case the URLs built below will contain the text "None".
KREUZBERG_API_PORT = os.getenv("KREUZBERG_API_PORT")


@router.get("/kreuzberg/health")
async def healthcheck() -> JSONResponse:
    """
    Test aliveness endpoint for Kreuzberg.

    Sends a GET request to the ``/health`` endpoint of the ``kreuzberg`` service.

    Returns:
        A 200 response with body ``{"service": "kreuzberg", "status": "healthy"}``
        when the service answers with a successful status code.

    Raises:
        aiohttp.ClientError: If the service cannot be reached or answers with an error status.
        TimeoutError: If the service does not answer within the probe timeout.

    """
    logger.info("[GET] /kreuzberg/health")
    url = f"http://kreuzberg:{KREUZBERG_API_PORT}/health"

    # A liveness probe must fail fast; it should not wait as long as an OCR request.
    probe_timeout = aiohttp.ClientTimeout(total=10)

    try:
        async with aiohttp.ClientSession(timeout=probe_timeout) as session:  # noqa: SIM117
            async with session.get(url) as response:
                response.raise_for_status()
    except (aiohttp.ClientError, TimeoutError):
        # Timeouts are not aiohttp.ClientError subclasses, so catch them explicitly to log them too.
        logger.exception("Failed to connect to kreuzberg service")
        raise

    return JSONResponse(
        status_code=status.HTTP_200_OK,
        content={"service": "kreuzberg", "status": "healthy"},
    )


@router.post("/kreuzberg-ocr/inference_single", status_code=status.HTTP_200_OK)
async def inference_single_doc(file_upload: Annotated[UploadFile | None, File()] = None) -> JSONResponse:
    """
    Runs Kreuzberg OCR inference on a single document.

    UploadFile object forwarded onto inference API.

    Args:
        file_upload: The document to process, sent as the ``file_upload`` multipart form field.

    Returns:
        A 200 response with the keys ``filename``, ``duration_in_second`` and ``ocr-result``;
        a 400 response when no file was uploaded; a 502 response when the kreuzberg
        service answers with an unexpected payload.

    Raises:
        aiohttp.ClientError: If the request to the kreuzberg service fails.

    """
    logger.info("[POST] /kreuzberg-ocr/inference_single")

    # The parameter defaults to None, so reject a missing upload explicitly
    # instead of failing with an AttributeError (HTTP 500) further down.
    if file_upload is None:
        return JSONResponse(
            status_code=status.HTTP_400_BAD_REQUEST,
            content={"detail": "No file provided in the 'file_upload' form field."},
        )

    url = f"http://kreuzberg:{KREUZBERG_API_PORT}/extract"  # fwd request to kreuzberg service

    data = aiohttp.FormData()
    data.add_field(
        "data",  # field name expected by Kreuzberg's /extract API
        file_upload.file,
        filename=file_upload.filename,
        content_type=file_upload.content_type,
    )
    headers = {"accept": "application/json"}

    logger.info("post request - url: %s", url)
    logger.info("post request - data: %s", data)
    logger.info("post request - headers: %s", headers)

    t1 = time.perf_counter()
    try:
        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60 * 60)) as session:  # noqa: SIM117
            async with session.post(url, data=data, headers=headers) as response:
                response.raise_for_status()
                ocr_results = await response.json()
    except aiohttp.ClientError:
        logger.exception("Request Exception")
        raise
    t2 = time.perf_counter()

    # Kreuzberg's /extract API expects a list of documents and always returns a list of extracted text
    # We only ever extract and return content for a single document
    try:
        ocr_result = ocr_results[0]["content"]
    except (IndexError, KeyError, TypeError):
        # An empty or malformed payload is an upstream fault, not a crash of this API.
        logger.exception("Unexpected response from kreuzberg service")
        return JSONResponse(
            status_code=status.HTTP_502_BAD_GATEWAY,
            content={"detail": "Unexpected response from kreuzberg service."},
        )

    response_json = {
        "filename": str(file_upload.filename),
        "duration_in_second": t2 - t1,
        "ocr-result": ocr_result,
    }

    return JSONResponse(status_code=status.HTTP_200_OK, content=response_json)
1 change: 1 addition & 0 deletions src/api/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
fastapi[standard]
uvicorn
requests
aiohttp
2 changes: 2 additions & 0 deletions tests/.env.tests
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ LOG_LEVEL=INFO

HOST_DATA_FOLDER="/absolute/path/to/tests/data/single_synthetic_doc"
CONTAINER_DATA_FOLDER="/data"
DATA_FOLDER="tests/data/single_synthetic_doc"

http_proxy=
https_proxy=
Expand All @@ -16,6 +17,7 @@ MARKER_API_PORT=8112
SPARROW_API_PORT=8001 # hard-coded in sparrow API
PADDLEOCR_API_PORT=8114
DOCLING_API_PORT=8115
KREUZBERG_API_PORT=8116

# Source PostgreSQL instance
POSTGRES_SOURCE_USER=postgres
Expand Down
9 changes: 8 additions & 1 deletion tests/api/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,14 @@ def test_local_start_api_and_healthy(ocr_forwarding_api_port: str) -> None:

@pytest.mark.parametrize(
"check_container_healthy",
["pyonb-ocr-forwarding-api-1", "pyonb-marker-1", "pyonb-sparrow-1", "pyonb-paddleocr-1", "pyonb-docling-1"],
[
"pyonb-ocr-forwarding-api-1",
"pyonb-marker-1",
"pyonb-sparrow-1",
"pyonb-paddleocr-1",
"pyonb-docling-1",
"pyonb-kreuzberg-1",
],
indirect=True,
)
def test_check_services(check_container_healthy: bool) -> None:
Expand Down
22 changes: 22 additions & 0 deletions tests/api/test_routers.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ def test_router_paddleocr(ocr_forwarding_api_port: str) -> None:
assert response.json() == {"service": "paddleocr", "status": "healthy"}


def test_router_kreuzberg(ocr_forwarding_api_port: str) -> None:
    """Check that the forwarding API reports the kreuzberg service as healthy."""
    health_url = f"http://127.0.0.1:{ocr_forwarding_api_port}/kreuzberg/health"
    reply = requests.get(health_url, timeout=5)

    assert reply.status_code == requests.codes.ok
    assert reply.json() == {"service": "kreuzberg", "status": "healthy"}


def test_inference_single_file_upload_marker(ocr_forwarding_api_port: str, single_pdf_filepath: Path) -> None:
"""Test PDF conversion using marker with single file endpoint."""
url = f"http://127.0.0.1:{ocr_forwarding_api_port}/marker/inference_single"
Expand Down Expand Up @@ -104,6 +111,21 @@ def test_inference_single_file_upload_paddleocr(ocr_forwarding_api_port: str, si
assert response.json()["filename"] in single_pdf_filename


def test_inference_single_file_upload_kreuzberg(ocr_forwarding_api_port: str, single_pdf_filepath: Path) -> None:
    """Upload one PDF to the kreuzberg single-file endpoint and check the reply."""
    endpoint = f"http://127.0.0.1:{ocr_forwarding_api_port}/kreuzberg-ocr/inference_single"
    pdf_name = single_pdf_filepath.name

    with single_pdf_filepath.open("rb") as pdf_handle:
        reply = requests.post(
            endpoint,
            files={"file_upload": (pdf_name, pdf_handle, "application/pdf")},
            timeout=60 * 60,
        )

    assert reply.status_code == requests.codes.ok
    assert reply.json()["duration_in_second"] >= 0
    assert reply.json()["filename"] == pdf_name


def test_inference_on_folder_marker(ocr_forwarding_api_port: str) -> None:
"""
Test PDF conversion using marker pointed at a folder of files.
Expand Down
Loading
Loading