Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,40 @@ services:
retries: 3
start_period: 30s

# Kreuzberg OCR service; only started when the `kreuzberg` profile is enabled.
kreuzberg:
  profiles: [kreuzberg]
  build:
    context: packages/ocr/kreuzberg
    dockerfile: Dockerfile
    args:
      <<: *build-args-common
      KREUZBERG_API_PORT: ${KREUZBERG_API_PORT}
  environment:
    <<: [*proxy-common, *common-env]
    DATA_FOLDER: /data
    KREUZBERG_API_PORT: ${KREUZBERG_API_PORT}
  env_file:
    - ./.env
  ports:
    - "${KREUZBERG_API_PORT}:${KREUZBERG_API_PORT}"
  volumes:
    - ${PWD}/${DATA_FOLDER}:/data
  networks:
    - pyonb_ocr_api
  healthcheck:
    # --fail makes curl exit non-zero on HTTP 4xx/5xx; without it an error
    # response from /health would still mark the container as healthy.
    test:
      [
        "CMD",
        "curl",
        "--fail",
        "-X",
        "GET",
        "http://localhost:${KREUZBERG_API_PORT}/health",
      ]
    interval: 10s
    timeout: 3s
    retries: 3
    start_period: 30s

ocr-forwarding-api:
build:
context: src/api
Expand Down
18 changes: 18 additions & 0 deletions packages/ocr/kreuzberg/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
FROM ghcr.io/astral-sh/uv:python3.13-bookworm AS app

# Fail fast in RUN/CMD: stop on errors, unset variables and broken pipes; echo commands.
SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"]

WORKDIR /app
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# With more than one source the destination must be a directory ending in "/".
COPY ./pyproject.toml ./README.md ./
COPY ./src src/

RUN uv venv
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked uv sync --no-editable --no-dev --compile-bytecode

# make uvicorn etc available
ENV PATH="/app/.venv/bin:$PATH"

# Consume the build arg supplied by docker-compose and use it as the default port;
# a value set in the container environment at runtime still takes precedence.
ARG KREUZBERG_API_PORT
ENV KREUZBERG_API_PORT=${KREUZBERG_API_PORT}

# `--reload` is omitted on purpose: uvicorn ignores `--workers` while reloading, and the
# package is installed non-editable so there is no source tree to watch.
# `exec` replaces the shell so uvicorn receives stop signals directly.
CMD exec uvicorn kreuzberg._api.main:app --host 0.0.0.0 --port "$KREUZBERG_API_PORT" --workers 4 --use-colors
52 changes: 52 additions & 0 deletions packages/ocr/kreuzberg/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Instructions

Before using the `kreuzberg` API for OCR, you will need to set the `KREUZBERG_API_PORT`
environment variable in the top-level `.env` file.

## Python

First install the `kreuzberg` API. From the top-level `pyonb` directory:

```shell
uv sync --extra kreuzberg
```

Then start the `kreuzberg` API:

```shell
python packages/ocr/kreuzberg/src/pyonb_kreuzberg/api.py
```

You can then use `curl` to send a PDF to the API:

```shell
curl -v -X POST http://127.0.0.1:8111/extract \
-F "data=@document.pdf" \
-H "accept: application/json"
```

Note, this assumes you have set `KREUZBERG_API_PORT=8111`.

Currently, this returns the response from the
[`kreuzberg` API](https://kreuzberg.dev/user-guide/api-server/#extract-files)
directly, rather than the standard `pyonb` response.

## Docker Compose

You will need to define the `OCR_FORWARDING_API_PORT` in the `.env` file.

Then, spin up the `ocr-forwarding-api` and `kreuzberg` services:

```shell
docker-compose --profile kreuzberg up --build --detach
```

You can then use `curl` to send a PDF to the forwarding API:

```shell
curl -v -X POST http://127.0.0.1:8110/kreuzberg-ocr/inference_single \
-F "file_upload=@document.pdf" \
-H "accept: application/json"
```

Note, this assumes you have set `OCR_FORWARDING_API_PORT` to `8110`.
14 changes: 14 additions & 0 deletions packages/ocr/kreuzberg/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Build configuration: the package is built with hatchling.
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]

# Metadata and runtime dependencies of the pyonb wrapper around kreuzberg.
[project]
dependencies = [
# kreuzberg with its API server and the easyocr/paddleocr extras, pinned to an exact version.
# NOTE(review): presumably pinned because api.py imports the private `kreuzberg._api` module - confirm.
"kreuzberg[api,easyocr,paddleocr]==3.11.0",
# ASGI server used to serve the kreuzberg app.
"uvicorn",
]
description = "pyonb wrapper around kreuzberg"
name = "pyonb-kreuzberg"
readme = "README.md"
requires-python = ">=3.11"
version = "0.1.0"
1 change: 1 addition & 0 deletions packages/ocr/kreuzberg/src/pyonb_kreuzberg/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Package for converting PDFs to structured text using Kreuzberg OCR."""
18 changes: 18 additions & 0 deletions packages/ocr/kreuzberg/src/pyonb_kreuzberg/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""API for Kreuzberg OCR."""

import os

import uvicorn
from kreuzberg._api.main import app

# Port the API listens on, as read from the environment: a string, or None when unset.
KREUZBERG_API_PORT = os.getenv("KREUZBERG_API_PORT")

if __name__ == "__main__":
    if not KREUZBERG_API_PORT:
        # Fail with a readable message instead of handing uvicorn a missing port.
        msg = "The KREUZBERG_API_PORT environment variable must be set"
        raise SystemExit(msg)

    uvicorn.run(
        # uvicorn only accepts `workers` (or `reload`) when the application is given as an
        # import string; with an app object it logs a warning and exits with status 1.
        "kreuzberg._api.main:app",
        host="127.0.0.1",
        # Environment variables are strings; uvicorn expects an integer port.
        port=int(KREUZBERG_API_PORT),
        # `reload=True` is not passed because uvicorn ignores `workers` while reloading.
        workers=4,
        use_colors=True,
    )
10 changes: 10 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ optional-dependencies = {dev = [
"mkdocs-material",
"mkdocstrings",
"mkdocstrings-python",
], kreuzberg = [
"pyonb-kreuzberg",
], test = [
"pytest",
"pytest-cov",
Expand Down Expand Up @@ -149,3 +151,11 @@ env.docs = {commands = [
gh.python."3.11" = ["py311"]
gh.python."3.12" = ["py312"]
gh.python."3.13" = ["py313"]

# Resolve `pyonb-kreuzberg` from this repository's uv workspace.
[tool.uv.sources]
pyonb-kreuzberg = {workspace = true}

# Workspace members are the paths matching the glob below.
[tool.uv.workspace]
members = [
"packages/ocr/*",
]
6 changes: 4 additions & 2 deletions src/api/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
from fastapi import FastAPI, status
from fastapi.responses import JSONResponse, RedirectResponse

from .routers import docling, marker, paddleocr, sparrow
from .routers import docling, kreuzberg, marker, paddleocr, sparrow

_today = datetime.datetime.now(datetime.UTC).strftime("%Y_%m_%d") # type: ignore[attr-defined] # mypy complains that 'Module has no attribute "UTC"'
logging.basicConfig(
filename="pyonb-" + datetime.datetime.now(datetime.UTC).strftime("%Y_%m_%d") + ".log",
filename=f"pyonb-{_today}.log",
format="%(asctime)s %(message)s",
filemode="a",
)
Expand All @@ -23,6 +24,7 @@
app.include_router(marker.router)
app.include_router(paddleocr.router)
app.include_router(docling.router)
app.include_router(kreuzberg.router)


@app.get("/", include_in_schema=False)
Expand Down
84 changes: 84 additions & 0 deletions src/api/app/routers/kreuzberg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Routers for Kreuzberg OCR."""

import logging
import os
import time
from typing import Annotated, Any

import aiohttp
from fastapi import APIRouter, File, UploadFile, status
from fastapi.responses import JSONResponse

# Logger used by the endpoints below; getLogger() without a name returns the root logger.
logger = logging.getLogger()

# Router collecting the Kreuzberg endpoints defined in this module.
router = APIRouter()

# Port of the kreuzberg service, read once at import time; None when the variable is unset.
KREUZBERG_API_PORT = os.getenv("KREUZBERG_API_PORT")


@router.get("/kreuzberg/health")
async def healthcheck() -> JSONResponse:
    """
    Report whether the kreuzberg service is reachable.

    Sends a GET request to the kreuzberg service's ``/health`` endpoint.

    Returns:
        A 200 response with ``{"service": "kreuzberg", "status": "healthy"}`` when the
        service answers with a success status, otherwise a 503 response with
        ``{"service": "kreuzberg", "status": "unhealthy"}``.

    """
    logger.info("[GET] /kreuzberg/health")
    url = f"http://kreuzberg:{KREUZBERG_API_PORT}/health"
    # A liveness probe has to answer quickly, so it gets a short timeout of its own.
    probe_timeout = aiohttp.ClientTimeout(total=3)

    try:
        async with aiohttp.ClientSession(timeout=probe_timeout) as session:  # noqa: SIM117
            async with session.get(url) as response:
                response.raise_for_status()
    except (aiohttp.ClientError, TimeoutError):
        # The total timeout surfaces as TimeoutError, which is not an aiohttp.ClientError.
        logger.exception("Failed to connect to kreuzberg service")
        return JSONResponse(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            content={"service": "kreuzberg", "status": "unhealthy"},
        )

    return JSONResponse(
        status_code=status.HTTP_200_OK,
        content={"service": "kreuzberg", "status": "healthy"},
    )


@router.post("/kreuzberg-ocr/inference_single", status_code=status.HTTP_200_OK)
async def inference_single_doc(file_upload: Annotated[UploadFile | None, File()] = None) -> JSONResponse:
    """
    Runs Kreuzberg OCR inference on a single document.

    The uploaded file is forwarded to the kreuzberg service's ``/extract`` endpoint and
    the text extracted for it is returned.

    Args:
        file_upload: The document to process, sent in the ``file_upload`` form field.

    Returns:
        A 200 response with the keys ``filename``, ``duration_in_second`` and
        ``ocr-result``; a 400 response when no file was uploaded; a 502 response when
        the kreuzberg service replies with an unexpected payload.

    Raises:
        aiohttp.ClientError: If the request to the kreuzberg service fails.

    """
    logger.info("[POST] /kreuzberg-ocr/inference_single")

    # The parameter is optional in the signature, so reject a missing upload explicitly
    # rather than failing on an attribute access below.
    if file_upload is None:
        return JSONResponse(
            status_code=status.HTTP_400_BAD_REQUEST,
            content={"detail": "No file was uploaded in the 'file_upload' form field."},
        )

    url = f"http://kreuzberg:{KREUZBERG_API_PORT}/extract"  # fwd request to kreuzberg service

    data = aiohttp.FormData()
    data.add_field(
        "data",  # field name expected by Kreuzberg's /extract API
        file_upload.file,
        filename=file_upload.filename,
        content_type=file_upload.content_type,
    )
    headers = {"accept": "application/json"}

    logger.info("post request - url: %s", url)
    # Log the file name; the FormData object itself has no informative representation.
    logger.info("post request - file: %s", file_upload.filename)
    logger.info("post request - headers: %s", headers)

    t1 = time.perf_counter()
    try:
        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60 * 60)) as session:  # noqa: SIM117
            async with session.post(url, data=data, headers=headers) as response:
                response.raise_for_status()
                ocr_results = await response.json()
    except aiohttp.ClientError:
        logger.exception("Request Exception")
        raise
    t2 = time.perf_counter()

    # Kreuzberg's /extract API expects a list of documents and always returns a list of extracted text
    # We only ever extract and return content for a single document
    try:
        ocr_result = ocr_results[0]["content"]
    except (IndexError, KeyError, TypeError):
        logger.exception("Unexpected response from kreuzberg service")
        return JSONResponse(
            status_code=status.HTTP_502_BAD_GATEWAY,
            content={"detail": "Unexpected response from kreuzberg service."},
        )

    response_json = {
        "filename": str(file_upload.filename),
        "duration_in_second": t2 - t1,
        "ocr-result": ocr_result,
    }

    return JSONResponse(status_code=status.HTTP_200_OK, content=response_json)
1 change: 1 addition & 0 deletions src/api/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
fastapi[standard]
uvicorn
requests
aiohttp
1 change: 1 addition & 0 deletions tests/.env.tests
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ MARKER_API_PORT=8112
SPARROW_API_PORT=8001 # hard-coded in sparrow API
PADDLEOCR_API_PORT=8114
DOCLING_API_PORT=8115
KREUZBERG_API_PORT=8111

# Source PostgreSQL instance
POSTGRES_SOURCE_USER=postgres
Expand Down
9 changes: 8 additions & 1 deletion tests/api/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,14 @@ def test_local_start_api_and_healthy(ocr_forwarding_api_port: str) -> None:

@pytest.mark.parametrize(
"check_container_healthy",
["pyonb-ocr-forwarding-api-1", "pyonb-marker-1", "pyonb-sparrow-1", "pyonb-paddleocr-1", "pyonb-docling-1"],
[
"pyonb-ocr-forwarding-api-1",
"pyonb-marker-1",
"pyonb-sparrow-1",
"pyonb-paddleocr-1",
"pyonb-docling-1",
"pyonb-kreuzberg-1",
],
indirect=True,
)
def test_check_services(check_container_healthy: bool) -> None:
Expand Down
22 changes: 22 additions & 0 deletions tests/api/test_routers.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ def test_router_paddleocr(ocr_forwarding_api_port: str) -> None:
assert response.json() == {"service": "paddleocr", "status": "healthy"}


def test_router_kreuzberg(ocr_forwarding_api_port: str) -> None:
    """Check that the forwarding API reports the kreuzberg service as healthy."""
    health_url = f"http://127.0.0.1:{ocr_forwarding_api_port}/kreuzberg/health"
    expected_body = {"service": "kreuzberg", "status": "healthy"}

    reply = requests.get(health_url, timeout=5)

    assert reply.status_code == requests.codes.ok
    assert reply.json() == expected_body


def test_inference_single_file_upload_marker(ocr_forwarding_api_port: str, single_pdf_filepath: Path) -> None:
"""Test PDF conversion using marker with single file endpoint."""
url = f"http://127.0.0.1:{ocr_forwarding_api_port}/marker/inference_single"
Expand Down Expand Up @@ -104,6 +111,21 @@ def test_inference_single_file_upload_paddleocr(ocr_forwarding_api_port: str, si
assert response.json()["filename"] in single_pdf_filename


def test_inference_single_file_upload_kreuzberg(ocr_forwarding_api_port: str, single_pdf_filepath: Path) -> None:
    """Test PDF conversion using kreuzberg with single file endpoint."""
    url = f"http://127.0.0.1:{ocr_forwarding_api_port}/kreuzberg-ocr/inference_single"

    single_pdf_filename = single_pdf_filepath.name

    with single_pdf_filepath.open("rb") as f:
        files = {"file_upload": (single_pdf_filename, f, "application/pdf")}
        # OCR can be slow, so allow the request up to an hour.
        response = requests.post(url, files=files, timeout=60 * 60)

    assert response.status_code == requests.codes.ok
    payload = response.json()
    assert payload["duration_in_second"] >= 0
    assert payload["filename"] == single_pdf_filename
    # The extracted text itself is document-dependent; only check that it is returned.
    assert "ocr-result" in payload


def test_inference_on_folder_marker(ocr_forwarding_api_port: str) -> None:
"""
Test PDF conversion using marker pointed at a folder of files.
Expand Down
Loading
Loading