Skip to content

Commit 4551c39

Browse files
derKater86 and claude committed
Add FastAPI OCR service with Docker and CI/CD pipeline
- FastAPI REST API with /ocr/image and /ocr/pdf endpoints
- Tesseract OCR with multi-language support and structured output (bounding boxes)
- PDF to image conversion via pdf2image/poppler
- Dockerfile based on python:3.12-slim with tesseract-ocr-all
- docker-compose.yml with port 8080 mapping
- GitHub Actions workflow to build and push to ghcr.io on main branch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 0f8edba commit 4551c39

File tree

13 files changed

+284
-0
lines changed

13 files changed

+284
-0
lines changed

.github/workflows/build.yml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
name: Build & Push Docker Image

# Runs for pushes to main and for pull requests targeting main.
# Pull-request runs only build; the image is pushed for every other event
# (see the `push:` input of the final step).
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Pull-request builds never push, so they do not need registry
      # credentials; skip the login there.
      - name: Log in to GitHub Container Registry
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Derives the image tags: branch name, short commit SHA (sha-…),
      # and `latest` only when the ref is main.
      - name: Extract Docker metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=ref,event=branch
            type=sha,prefix=sha-
            type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

Dockerfile

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
FROM python:3.12-slim

# Install system dependencies: Tesseract (all language packs) + Poppler for pdf2image
RUN apt-get update && apt-get install -y --no-install-recommends \
        tesseract-ocr \
        tesseract-ocr-all \
        poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# The service parses user-uploaded images and PDFs, so it should not run as
# root: create an unprivileged account for the runtime process.
RUN useradd --create-home --shell /usr/sbin/nologin appuser

WORKDIR /app

# Install Python dependencies (cached layer)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app/ ./app/

# Drop privileges for everything that follows (port 8000 is unprivileged).
USER appuser

EXPOSE 8000

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

app/__init__.py

Whitespace-only changes.

app/main.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from fastapi import FastAPI
2+
from fastapi.responses import JSONResponse
3+
4+
from app.routes.ocr import router as ocr_router
5+
6+
# ASGI application object; the Dockerfile's uvicorn command loads it as
# ``app.main:app``.
app = FastAPI(
    version="1.0.0",
    title="OCR Everything",
    description="REST API for OCR of images and PDFs using Tesseract.",
)

# Register the OCR endpoints on the application.
app.include_router(router=ocr_router)
13+
14+
15+
@app.get("/health", tags=["Health"])
async def health() -> JSONResponse:
    """Liveness probe: always responds with ``{"status": "ok"}``."""
    payload = {"status": "ok"}
    return JSONResponse(content=payload)

app/models/__init__.py

Whitespace-only changes.

app/models/response.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from pydantic import BaseModel
2+
3+
4+
class BoundingBox(BaseModel):
    """Rectangle locating a recognised word, as reported by Tesseract."""

    # Filled from Tesseract's `left` / `top` / `width` / `height` columns.
    # NOTE(review): presumably pixel units with a top-left origin; confirm.
    x: int
    y: int
    width: int
    height: int
9+
10+
11+
class Word(BaseModel):
    """A single recognised word with its confidence and position."""

    text: str
    # Tesseract's `conf` value for the word; entries with a negative
    # confidence are dropped before a Word is built, so this is >= 0.
    confidence: float
    bounding_box: BoundingBox
15+
16+
17+
class Page(BaseModel):
    """OCR result for one page (an image counts as a single page)."""

    # 1-based; the endpoints overwrite it with the page's position.
    page_number: int
    text: str
    # Only populated when structured output was requested; None otherwise.
    words: list[Word] | None = None
21+
22+
23+
class OCRResponse(BaseModel):
    """Top-level response body of the OCR endpoints."""

    # Text of the whole document; for PDFs the page texts are joined with a
    # blank line between them.
    text: str
    pages: list[Page]
    # Language code the request was processed with.
    language: str
    # Elapsed time measured by the endpoint, rounded to two decimals.
    processing_time_ms: float

app/routes/__init__.py

Whitespace-only changes.

app/routes/ocr.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import io
import time

from fastapi import APIRouter, File, HTTPException, Query, UploadFile
from fastapi.concurrency import run_in_threadpool
from PIL import Image

from app.models.response import OCRResponse, Page
from app.services.ocr_service import ocr_image
from app.services.pdf_service import pdf_to_images
10+
11+
router = APIRouter(prefix="/ocr", tags=["OCR"])
12+
13+
# MIME types accepted by the image endpoint. Kept immutable so the shared
# allow-list cannot be modified by accident at runtime; membership tests and
# sorted() work exactly as with a plain set.
SUPPORTED_IMAGE_TYPES: frozenset[str] = frozenset(
    {
        "image/png",
        "image/jpeg",
        "image/jpg",
        "image/tiff",
        "image/bmp",
        "image/webp",
    }
)
21+
22+
23+
@router.post("/image", response_model=OCRResponse)
async def ocr_image_endpoint(
    file: UploadFile = File(...),
    language: str = Query(default="eng", description="Tesseract language code (e.g. eng, deu, fra)"),
    structured: bool = Query(default=False, description="Return word-level bounding boxes and confidence scores"),
) -> OCRResponse:
    """Run OCR on a single uploaded image.

    Args:
        file: Uploaded image; its content type must be one of
            ``SUPPORTED_IMAGE_TYPES``.
        language: Language code forwarded to the OCR service.
        structured: When true, the page also carries word-level data.

    Returns:
        An ``OCRResponse`` holding exactly one page (page number 1).

    Raises:
        HTTPException: 415 for an unsupported content type, 400 when the
            upload cannot be decoded as an image, 500 when OCR itself fails.
    """
    if file.content_type not in SUPPORTED_IMAGE_TYPES:
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file type '{file.content_type}'. Supported: {sorted(SUPPORTED_IMAGE_TYPES)}",
        )

    start = time.perf_counter()
    contents = await file.read()

    try:
        image = Image.open(io.BytesIO(contents)).convert("RGB")
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not open image: {e}") from e

    try:
        # OCR is blocking, CPU-heavy work; run it in the threadpool so the
        # event loop keeps serving other requests meanwhile.
        page = await run_in_threadpool(ocr_image, image, language=language, structured=structured)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"OCR failed: {e}") from e

    page.page_number = 1
    elapsed_ms = (time.perf_counter() - start) * 1000

    return OCRResponse(
        text=page.text,
        pages=[page],
        language=language,
        processing_time_ms=round(elapsed_ms, 2),
    )
57+
58+
59+
@router.post("/pdf", response_model=OCRResponse)
async def ocr_pdf_endpoint(
    file: UploadFile = File(...),
    language: str = Query(default="eng", description="Tesseract language code (e.g. eng, deu, fra)"),
    structured: bool = Query(default=False, description="Return word-level bounding boxes and confidence scores"),
    dpi: int = Query(default=200, ge=72, le=600, description="DPI for PDF rendering (higher = better quality, slower)"),
) -> OCRResponse:
    """Run OCR on every page of an uploaded PDF.

    Args:
        file: Uploaded PDF (``application/pdf`` or
            ``application/octet-stream``).
        language: Language code forwarded to the OCR service.
        structured: When true, each page also carries word-level data.
        dpi: Rendering resolution used when converting pages to images.

    Returns:
        An ``OCRResponse`` with one entry per page (numbered from 1) and the
        page texts joined by a blank line.

    Raises:
        HTTPException: 415 for a wrong content type, 400 when the PDF cannot
            be parsed or has no pages, 500 when OCR fails on a page.
    """
    if file.content_type not in ("application/pdf", "application/octet-stream"):
        raise HTTPException(status_code=415, detail="File must be a PDF (application/pdf)")

    start = time.perf_counter()
    contents = await file.read()

    try:
        # Rasterising the PDF is blocking work; keep it off the event loop.
        images = await run_in_threadpool(pdf_to_images, contents, dpi=dpi)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not parse PDF: {e}") from e

    if not images:
        raise HTTPException(status_code=400, detail="PDF contains no pages")

    pages: list[Page] = []
    for i, img in enumerate(images, start=1):
        try:
            # Same reasoning as above: OCR runs in the threadpool so other
            # requests are served while a page is being processed.
            page = await run_in_threadpool(ocr_image, img, language=language, structured=structured)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"OCR failed on page {i}: {e}") from e
        page.page_number = i
        pages.append(page)

    full_text = "\n\n".join(p.text for p in pages)
    elapsed_ms = (time.perf_counter() - start) * 1000

    return OCRResponse(
        text=full_text,
        pages=pages,
        language=language,
        processing_time_ms=round(elapsed_ms, 2),
    )

app/services/__init__.py

Whitespace-only changes.

app/services/ocr_service.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import pytesseract
2+
from PIL import Image
3+
4+
from app.models.response import BoundingBox, Page, Word
5+
6+
7+
def ocr_image(image: Image.Image, language: str, structured: bool) -> Page:
    """Run Tesseract OCR on a PIL image and return a Page result.

    Args:
        image: Image to recognise.
        language: Language code passed to Tesseract as ``lang``.
        structured: When true, also return word-level confidence and
            bounding boxes; otherwise only the plain text.

    Returns:
        A ``Page`` with ``page_number`` set to 1 (callers renumber it).
        In structured mode ``words`` lists every accepted word and ``text``
        is rebuilt from those words, one output line per Tesseract line.
    """
    if not structured:
        text = pytesseract.image_to_string(image, lang=language)
        return Page(page_number=1, text=text.strip())

    data = pytesseract.image_to_data(
        image,
        lang=language,
        output_type=pytesseract.Output.DICT,
    )

    words: list[Word] = []
    # Words grouped by the line Tesseract assigned them to. Dict insertion
    # order keeps the lines in the order Tesseract reported them, so the
    # rebuilt text keeps its line breaks instead of collapsing to one line.
    lines: dict[tuple[int, int, int, int], list[str]] = {}

    # .get() guards against a result dict without a "text" column.
    for i, word_text in enumerate(data.get("text", [])):
        if not word_text.strip():
            continue
        conf = float(data["conf"][i])
        # Negative confidence marks entries that are not recognised words.
        if conf < 0:
            continue
        words.append(
            Word(
                text=word_text,
                confidence=conf,
                bounding_box=BoundingBox(
                    x=data["left"][i],
                    y=data["top"][i],
                    width=data["width"][i],
                    height=data["height"][i],
                ),
            )
        )
        line_key = (
            data["page_num"][i],
            data["block_num"][i],
            data["par_num"][i],
            data["line_num"][i],
        )
        lines.setdefault(line_key, []).append(word_text)

    text = "\n".join(" ".join(parts) for parts in lines.values())
    return Page(page_number=1, text=text, words=words)

0 commit comments

Comments
 (0)