Add FastAPI OCR service with Docker and CI/CD pipeline

derKater86 · claude · derKater86 · commit 4551c39e4ad8 · 2026-02-23T20:58:43.000+01:00
- FastAPI REST API with /ocr/image and /ocr/pdf endpoints
- Tesseract OCR with multi-language support and structured output (bounding boxes)
- PDF to image conversion via pdf2image/poppler
- Dockerfile based on python:3.12-slim with tesseract-ocr-all
- docker-compose.yml with port 8080 mapping
- GitHub Actions workflow to build and push to ghcr.io on main branch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -0,0 +1,54 @@
+# Builds the Docker image for every push to main and every pull request
+# targeting main. The image is pushed to the GitHub Container Registry only
+# for pushes; pull requests get a build-only run that validates the Dockerfile.
+name: Build & Push Docker Image
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+env:
+  REGISTRY: ghcr.io
+  # owner/repo of the repository running the workflow; used as the image path.
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      # Lets GITHUB_TOKEN push the built image to the registry.
+      packages: write
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      # Registry credentials are only needed when the image is actually pushed.
+      # Pull-request builds never push (see the "Build and push" step), so the
+      # login is skipped there instead of authenticating for nothing.
+      - name: Log in to GitHub Container Registry
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # Tag rules, in order: the branch name, "sha-" followed by the commit
+      # hash, and "latest" only when the ref is the main branch.
+      - name: Extract Docker metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          tags: |
+            type=ref,event=branch
+            type=sha,prefix=sha-
+            type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      # Build always; push only when the trigger is not a pull request.
+      # Layers are cached in the GitHub Actions cache between runs.
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,21 @@
+# Runtime image for the OCR API: Python 3.12 plus the Tesseract and Poppler
+# command-line tools that pytesseract and pdf2image depend on.
+# NOTE(review): there is no USER instruction, so the server runs as the base
+# image's default user (presumably root) — confirm this is acceptable.
+FROM python:3.12-slim
+
+# Install system dependencies: Tesseract (all language packs) + Poppler for pdf2image.
+# The apt package lists are removed in the same layer to keep the image smaller.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    tesseract-ocr \
+    tesseract-ocr-all \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Install Python dependencies (cached layer): requirements.txt is copied on its
+# own so this layer is rebuilt only when the pinned dependencies change.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application code to /app/app so it is importable as the "app" package.
+COPY app/ ./app/
+
+# Port uvicorn listens on (see CMD); docker-compose.yml maps host 8080 to it.
+EXPOSE 8000
+
+# Serve the FastAPI instance "app" from app/main.py on all interfaces.
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/app/__init__.py b/app/__init__.py
diff --git a/app/main.py b/app/main.py
@@ -0,0 +1,17 @@
+from fastapi import FastAPI
+from fastapi.responses import JSONResponse
+
+from app.routes.ocr import router as ocr_router
+
+app = FastAPI(
+    title="OCR Everything",
+    description="REST API for OCR of images and PDFs using Tesseract.",
+    version="1.0.0",
+)
+
+app.include_router(ocr_router)
+
+
+@app.get("/health", tags=["Health"])
+async def health() -> JSONResponse:
+    return JSONResponse({"status": "ok"})
diff --git a/app/models/__init__.py b/app/models/__init__.py
diff --git a/app/models/response.py b/app/models/response.py
@@ -0,0 +1,27 @@
+from pydantic import BaseModel
+
+
+# Rectangle locating one recognized word on the page image.
+# app/services/ocr_service.py fills it from the "left", "top", "width" and
+# "height" columns of pytesseract's image_to_data output (presumably pixel
+# units with the origin at the image's top-left corner — confirm).
+class BoundingBox(BaseModel):
+    x: int  # taken from the "left" column
+    y: int  # taken from the "top" column
+    width: int
+    height: int
+
+
+# One recognized token together with its confidence and position.
+class Word(BaseModel):
+    # Token text exactly as reported by Tesseract (not stripped).
+    text: str
+    # float() of Tesseract's "conf" value; ocr_image drops entries whose
+    # confidence is negative, so values stored here are >= 0.
+    confidence: float
+    bounding_box: BoundingBox
+
+
+# OCR result for a single image or a single PDF page.
+class Page(BaseModel):
+    # 1-based index; ocr_image always creates the page with 1 and the route
+    # handlers overwrite it (1 for images, enumerate(start=1) for PDF pages).
+    page_number: int
+    # Full recognized text of the page.
+    text: str
+    # Word-level details; left as None unless structured output was requested.
+    words: list[Word] | None = None
+
+
+# Response body shared by the /ocr/image and /ocr/pdf endpoints.
+class OCRResponse(BaseModel):
+    # Text of the single page for images; for PDFs the page texts joined
+    # with a blank line ("\n\n") between pages.
+    text: str
+    pages: list[Page]
+    # Echo of the "language" query parameter passed to Tesseract.
+    language: str
+    # Time spent in the handler, measured with time.perf_counter and
+    # rounded to two decimals.
+    processing_time_ms: float
diff --git a/app/routes/__init__.py b/app/routes/__init__.py
diff --git a/app/routes/ocr.py b/app/routes/ocr.py
@@ -0,0 +1,97 @@
+import io
+import time
+
+from fastapi import APIRouter, File, HTTPException, Query, UploadFile
+from PIL import Image
+
+from app.models.response import OCRResponse, Page
+from app.services.ocr_service import ocr_image
+from app.services.pdf_service import pdf_to_images
+
+router = APIRouter(prefix="/ocr", tags=["OCR"])
+
+SUPPORTED_IMAGE_TYPES = {
+    "image/png",
+    "image/jpeg",
+    "image/jpg",
+    "image/tiff",
+    "image/bmp",
+    "image/webp",
+}
+
+
+@router.post("/image", response_model=OCRResponse)
+async def ocr_image_endpoint(
+    file: UploadFile = File(...),
+    language: str = Query(default="eng", description="Tesseract language code (e.g. eng, deu, fra)"),
+    structured: bool = Query(default=False, description="Return word-level bounding boxes and confidence scores"),
+) -> OCRResponse:
+    if file.content_type not in SUPPORTED_IMAGE_TYPES:
+        raise HTTPException(
+            status_code=415,
+            detail=f"Unsupported file type '{file.content_type}'. Supported: {sorted(SUPPORTED_IMAGE_TYPES)}",
+        )
+
+    start = time.perf_counter()
+    contents = await file.read()
+
+    try:
+        image = Image.open(io.BytesIO(contents)).convert("RGB")
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Could not open image: {e}")
+
+    try:
+        page = ocr_image(image, language=language, structured=structured)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"OCR failed: {e}")
+
+    page.page_number = 1
+    elapsed_ms = (time.perf_counter() - start) * 1000
+
+    return OCRResponse(
+        text=page.text,
+        pages=[page],
+        language=language,
+        processing_time_ms=round(elapsed_ms, 2),
+    )
+
+
+@router.post("/pdf", response_model=OCRResponse)
+async def ocr_pdf_endpoint(
+    file: UploadFile = File(...),
+    language: str = Query(default="eng", description="Tesseract language code (e.g. eng, deu, fra)"),
+    structured: bool = Query(default=False, description="Return word-level bounding boxes and confidence scores"),
+    dpi: int = Query(default=200, ge=72, le=600, description="DPI for PDF rendering (higher = better quality, slower)"),
+) -> OCRResponse:
+    if file.content_type not in ("application/pdf", "application/octet-stream"):
+        raise HTTPException(status_code=415, detail="File must be a PDF (application/pdf)")
+
+    start = time.perf_counter()
+    contents = await file.read()
+
+    try:
+        images = pdf_to_images(contents, dpi=dpi)
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Could not parse PDF: {e}")
+
+    if not images:
+        raise HTTPException(status_code=400, detail="PDF contains no pages")
+
+    pages: list[Page] = []
+    for i, img in enumerate(images, start=1):
+        try:
+            page = ocr_image(img, language=language, structured=structured)
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"OCR failed on page {i}: {e}")
+        page.page_number = i
+        pages.append(page)
+
+    full_text = "\n\n".join(p.text for p in pages)
+    elapsed_ms = (time.perf_counter() - start) * 1000
+
+    return OCRResponse(
+        text=full_text,
+        pages=pages,
+        language=language,
+        processing_time_ms=round(elapsed_ms, 2),
+    )
diff --git a/app/services/__init__.py b/app/services/__init__.py
diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py
@@ -0,0 +1,42 @@
+import pytesseract
+from PIL import Image
+
+from app.models.response import BoundingBox, Page, Word
+
+
+def ocr_image(image: Image.Image, language: str, structured: bool) -> Page:
+    """Run Tesseract OCR on a PIL image and return a Page result."""
+    if structured:
+        data = pytesseract.image_to_data(
+            image,
+            lang=language,
+            output_type=pytesseract.Output.DICT,
+        )
+        words: list[Word] = []
+        full_text_parts: list[str] = []
+
+        for i, word_text in enumerate(data["text"]):
+            if not word_text.strip():
+                continue
+            conf = float(data["conf"][i])
+            if conf < 0:
+                continue
+            words.append(
+                Word(
+                    text=word_text,
+                    confidence=conf,
+                    bounding_box=BoundingBox(
+                        x=data["left"][i],
+                        y=data["top"][i],
+                        width=data["width"][i],
+                        height=data["height"][i],
+                    ),
+                )
+            )
+            full_text_parts.append(word_text)
+
+        text = " ".join(full_text_parts)
+        return Page(page_number=1, text=text, words=words)
+
+    text = pytesseract.image_to_string(image, lang=language)
+    return Page(page_number=1, text=text.strip())
diff --git a/app/services/pdf_service.py b/app/services/pdf_service.py
@@ -0,0 +1,9 @@
+import io
+
+from pdf2image import convert_from_bytes
+from PIL import Image
+
+
+def pdf_to_images(pdf_bytes: bytes, dpi: int = 200) -> list[Image.Image]:
+    """Convert PDF bytes to a list of PIL images, one per page."""
+    return convert_from_bytes(pdf_bytes, dpi=dpi)
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -0,0 +1,10 @@
+# Local run configuration for the OCR API container.
+services:
+  ocr-api:
+    # Build the image using the current directory as the build context.
+    build: .
+    ports:
+      # host 8080 -> container 8000 (the port uvicorn listens on in the Dockerfile CMD)
+      - "8080:8000"
+    volumes:
+      # Bind-mounts the host ./app over /app/app, the path the Dockerfile copies
+      # the code to, so the container runs the host's source instead of the
+      # copy baked into the image.
+      # NOTE(review): looks like a development convenience — confirm before
+      # using this file for production deployments.
+      - ./app:/app/app
+    restart: unless-stopped
+    environment:
+      # Presumably set so Python output reaches the container logs unbuffered — confirm.
+      - PYTHONUNBUFFERED=1
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,7 @@
+# Web framework and the ASGI server started by the Dockerfile CMD.
+fastapi==0.115.8
+uvicorn[standard]==0.34.0
+# Not imported directly under app/; presumably required by FastAPI to parse
+# the multipart uploads used by the /ocr endpoints — confirm.
+python-multipart==0.0.20
+# OCR and PDF rendering: Python wrappers around the tesseract and poppler
+# tools installed in the Dockerfile.
+pytesseract==0.3.13
+pdf2image==1.17.0
+# Imaging library imported as PIL.
+Pillow==11.1.0
+# Response models in app/models/response.py.
+pydantic==2.10.6