Skip to content

Commit 5aabcf9

Browse files
authored
Merge pull request #43 from SAFEHR-data/paul/restructure-marker-paddle
Restructure `marker` and `paddleocr` packages
2 parents 2a36054 + ac7303b commit 5aabcf9

File tree

25 files changed

+419
-418
lines changed

25 files changed

+419
-418
lines changed

docker-compose.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,21 +50,21 @@ services:
5050
marker:
5151
profiles: [marker]
5252
build:
53-
context: src/ocr/marker
53+
context: packages/ocr/marker
5454
dockerfile: Dockerfile
5555
args:
5656
<<: *build-args-common
5757
MARKER_API_PORT: ${MARKER_API_PORT}
5858
environment:
5959
<<: [*proxy-common, *common-env]
60-
CONTAINER_DATA_FOLDER: /data
60+
DATA_FOLDER: /data
6161
MARKER_API_PORT: ${MARKER_API_PORT}
6262
env_file:
6363
- ./.env
6464
ports:
6565
- "${MARKER_API_PORT}:${MARKER_API_PORT}"
6666
volumes:
67-
- ${HOST_DATA_FOLDER}:${CONTAINER_DATA_FOLDER:-/data}
67+
- ${PWD}/${DATA_FOLDER}:/data
6868
networks:
6969
- pyonb_ocr_api
7070
healthcheck:
@@ -84,21 +84,21 @@ services:
8484
paddleocr:
8585
profiles: [paddleocr]
8686
build:
87-
context: src/ocr/paddleocr
87+
context: packages/ocr/paddleocr
8888
dockerfile: Dockerfile
8989
args:
9090
<<: *build-args-common
9191
PADDLEOCR_API_PORT: ${PADDLEOCR_API_PORT}
9292
environment:
9393
<<: [*proxy-common, *common-env]
94-
CONTAINER_DATA_FOLDER: /data
94+
DATA_FOLDER: /data
9595
PADDLEOCR_API_PORT: ${PADDLEOCR_API_PORT}
9696
env_file:
9797
- ./.env
9898
ports:
9999
- "${PADDLEOCR_API_PORT}:${PADDLEOCR_API_PORT}"
100100
volumes:
101-
- ${HOST_DATA_FOLDER}:${CONTAINER_DATA_FOLDER:-/data}
101+
- ${PWD}/${DATA_FOLDER}:/data
102102
networks:
103103
- pyonb_ocr_api
104104
healthcheck:

packages/ocr/marker/Dockerfile

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
FROM ghcr.io/astral-sh/uv:python3.13-bookworm AS app
2+
3+
SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"]
4+
5+
WORKDIR /app
6+
ENV PYTHONDONTWRITEBYTECODE=1
7+
ENV PYTHONUNBUFFERED=1
8+
9+
COPY ./pyproject.toml .
10+
COPY ./README.md .
11+
COPY ./src src/
12+
13+
RUN uv venv
14+
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked uv sync --no-editable --no-dev
15+
16+
# make uvicorn etc available
17+
ENV PATH="/app/.venv/bin:$PATH"
18+
19+
CMD uvicorn pyonb_marker.api:app --host 0.0.0.0 --port "$MARKER_API_PORT" --workers 4 --use-colors

packages/ocr/marker/README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Instructions
2+
3+
## Python
4+
5+
First install `pyonb_marker`. From the top-level `pyonb` directory:
6+
7+
```shell
8+
uv sync --extra marker
9+
```
10+
11+
Then, to convert a PDF to markdown:
12+
13+
```python
14+
import pyonb_marker
15+
16+
result = pyonb_marker.convert_pdf_to_markdown(
17+
filepath="path/to/data/input.pdf",
18+
)
19+
```
20+
21+
## Docker compose
22+
23+
From the `pyonb/packages/ocr/marker` directory:
24+
25+
```shell
26+
docker compose run marker data/ms-note-one-page.pdf data/output.md
27+
```
28+
29+
Note, you will need to set `DATA_FOLDER` in a `.env` file,
30+
e.g.: `DATA_FOLDER=path/to/data` (the directory containing your input PDFs, relative to the directory you run `docker compose` from).

packages/ocr/marker/pyproject.toml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[build-system]
2+
build-backend = "hatchling.build"
3+
requires = ["hatchling"]
4+
5+
[project]
6+
dependencies = [
7+
"accelerate",
8+
"fastapi[standard]",
9+
"marker-pdf",
10+
"ollama",
11+
"python-dotenv",
12+
"requests",
13+
"uvicorn",
14+
]
15+
description = "pyonb wrapper around marker"
16+
name = "pyonb-marker"
17+
readme = "README.md"
18+
requires-python = ">=3.11"
19+
version = "0.1.0"

src/ocr/marker/__init__.py renamed to packages/ocr/marker/src/pyonb_marker/__init__.py

File renamed without changes.

src/ocr/marker/api.py renamed to packages/ocr/marker/src/pyonb_marker/api.py

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@
88
from fastapi import FastAPI, File, HTTPException, UploadFile, status
99
from fastapi.responses import JSONResponse, RedirectResponse
1010

11+
from pyonb_marker.main import convert_pdf_to_markdown
12+
13+
_today = datetime.datetime.now(datetime.UTC).strftime("%Y_%m_%d") # type: ignore[attr-defined] # mypy complains that 'Module has no attribute "UTC"'
1114
logging.basicConfig(
12-
filename="marker." + datetime.datetime.now(tz=datetime.UTC).strftime("%Y%m%d") + ".log",
15+
filename=f"marker-{_today}.log",
1316
format="%(asctime)s %(message)s",
1417
filemode="a",
1518
)
@@ -18,18 +21,6 @@
1821
logger = logging.getLogger()
1922
logger.setLevel(logging.DEBUG)
2023

21-
# TODO(tom): improve imports - below try statements horrible
22-
try:
23-
# local
24-
from .main import run_marker
25-
except Exception:
26-
logger.exception("Detected inside Docker container.")
27-
# Docker container
28-
try:
29-
from main import run_marker # type: ignore # noqa: PGH003
30-
except Exception:
31-
logger.exception("Marker imports not possible.")
32-
3324
app = FastAPI(swagger_ui_parameters={"tryItOutEnabled": True})
3425

3526

@@ -71,7 +62,7 @@ async def inference(file: Annotated[UploadFile, File()] = None) -> JSONResponse:
7162
# marker requires path to file rather than UploadFile object, so create temp copy of file
7263
with Path(f"temp_api_file_{file.filename}").open("wb") as f: # noqa: ASYNC230
7364
f.write(content)
74-
result, _ = run_marker(f"temp_api_file_{file.filename}")
65+
result = convert_pdf_to_markdown(f"temp_api_file_{file.filename}")
7566
except Exception as e:
7667
raise HTTPException(status_code=400, detail=f"Failed to run marker. Error: {e}") from e
7768
else:

src/ocr/marker/main.py renamed to packages/ocr/marker/src/pyonb_marker/main.py

Lines changed: 19 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -22,37 +22,29 @@ def setup_converter(config, config_parser) -> PdfConverter: # noqa: ANN001
2222
)
2323

2424

25-
def convert_pdf_to_markdown(file_path: str | Path, output_format: str | Path = "markdown", use_llm: bool = True): # noqa: ANN201
25+
def convert_pdf_to_markdown( # noqa: ANN201
26+
file_path: str | Path,
27+
output_format: str | Path = "markdown",
28+
use_llm: bool = True,
29+
):
2630
"""Convert the PDF to markdown using Marker and optionally use LLM for improved accuracy."""
31+
config = {
32+
"output_format": output_format,
33+
"use_llm": use_llm,
34+
"llm_service": "marker.services.ollama.OllamaService",
35+
"ollama_model": "llama3.2",
36+
"ollama_base_url": "http://localhost:11434",
37+
"disable_images": True,
38+
}
39+
config_parser = ConfigParser(config)
40+
converter = setup_converter(config_parser.generate_config_dict(), config_parser)
2741
try:
28-
# Optionally enable LLM for improved accuracy
29-
config = {
30-
"output_format": output_format,
31-
"use_llm": use_llm,
32-
"llm_service": "marker.services.ollama.OllamaService",
33-
"ollama_model": "llama3.2",
34-
"ollama_base_url": "http://localhost:11434",
35-
}
36-
config_parser = ConfigParser(config)
37-
# Create the converter with the necessary settings
38-
converter = setup_converter(config_parser.generate_config_dict(), config_parser)
39-
40-
# Process the PDF file and convert to the specified output format
4142
rendered = converter(str(file_path))
42-
43-
# Extract the text (Markdown, JSON, or HTML) from the rendered object
44-
text, _, images = text_from_rendered(rendered)
43+
text, _, _ = text_from_rendered(rendered)
4544
except Exception:
4645
logger.exception("Error processing PDF.")
47-
else:
48-
return text, images
49-
50-
51-
def run_marker(input_pdf_path: str | Path): # noqa: ANN201
52-
"""Execute marker."""
53-
res, images = convert_pdf_to_markdown(file_path=input_pdf_path, use_llm=True, output_format="json")
5446

55-
return res, images
47+
return text
5648

5749

5850
if __name__ == "__main__":
@@ -64,11 +56,11 @@ def run_marker(input_pdf_path: str | Path): # noqa: ANN201
6456
input_pdf_path = Path(sys.argv[1])
6557
output_txt_path = Path(sys.argv[2])
6658

67-
res, images = run_marker(input_pdf_path)
59+
text = convert_pdf_to_markdown(input_pdf_path)
6860

6961
try:
7062
with output_txt_path.open("w", encoding="utf-8") as f:
71-
f.write(res)
63+
f.write(text)
7264

7365
logger.info("Text extracted to %s", output_txt_path)
7466

packages/ocr/paddleocr/Dockerfile

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
FROM ghcr.io/astral-sh/uv:python3.13-bookworm AS app
2+
3+
SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"]
4+
5+
6+
RUN --mount=type=cache,target=/var/cache/apt \
7+
--mount=type=cache,target=/var/lib/apt \
8+
apt-get update \
9+
&& apt-get install -y --no-install-recommends \
10+
ccache \
11+
cmake \
12+
curl \
13+
ffmpeg \
14+
libpoppler-cpp-dev \
15+
libsm6 \
16+
libxext6 \
17+
pkg-config \
18+
poppler-utils \
19+
&& rm -rf /var/lib/apt/lists/*
20+
21+
WORKDIR /app
22+
ENV PYTHONDONTWRITEBYTECODE=1
23+
ENV PYTHONUNBUFFERED=1
24+
25+
COPY ./pyproject.toml .
26+
COPY ./README.md .
27+
COPY ./src src/
28+
29+
RUN uv venv
30+
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked uv sync --no-editable --no-dev
31+
32+
# make uvicorn etc available
33+
ENV PATH="/app/.venv/bin:$PATH"
34+
35+
CMD uvicorn pyonb_paddleocr.api:app --host 0.0.0.0 --port "$PADDLEOCR_API_PORT" --workers 4 --use-colors

packages/ocr/paddleocr/README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Instructions
2+
3+
Before using the `paddleocr` API for OCR, you will need to set the `PADDLEOCR_API_PORT`
4+
environment variable in the top-level `.env` file.
5+
6+
## Docker Compose
7+
8+
You will need to define the `OCR_FORWARDING_API_PORT` in the `.env` file.
9+
10+
Then, spin up the `ocr-forwarding-api` and `paddleocr` services:
11+
12+
```shell
13+
docker-compose --profile paddleocr up --build --detach
14+
```
15+
16+
You can then use `curl` to send a PDF to the forwarding API:
17+
18+
```shell
19+
curl -v -X POST http://127.0.0.1:8110/paddleocr/inference_single \
20+
-F "file_upload=@document.pdf" \
21+
-H "accept: application/json"
22+
```
23+
24+
Note, this assumes you have set `OCR_FORWARDING_API_PORT` to `8110`.

0 commit comments

Comments
 (0)