Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ FROM pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# Refresh the package index and install the system dependencies in ONE layer, so a
# cached `apt-get update` layer can never be combined with a newer package list.
# Only the install line that includes curl is kept; the older line without it was redundant.
RUN apt-get update && \
    apt-get install --fix-missing -y -q --no-install-recommends libgomp1 ffmpeg libsm6 pdftohtml libxext6 git ninja-build g++ qpdf pandoc curl


RUN apt-get install -y ocrmypdf
Expand Down Expand Up @@ -52,4 +52,3 @@ RUN python src/download_models.py
# Append the application sources to the Python import path.
# Written in the `ENV key=value` form; the whitespace-separated form is legacy syntax.
ENV PYTHONPATH="${PYTHONPATH}:/app/src"
# Transformers library settings (presumably to keep container logs quiet — confirm against the library docs).
ENV TRANSFORMERS_VERBOSITY=error
ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1

10 changes: 10 additions & 0 deletions Dockerfile.ollama
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Ollama server image with curl added so the container can report its own health.
FROM ollama/ollama:latest

# curl is needed only by the HEALTHCHECK below. Recommended packages are skipped
# (matching the main Dockerfile) and the apt lists are removed to keep the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*

# Bind the API to all interfaces on port 11434.
ENV OLLAMA_HOST=0.0.0.0:11434

EXPOSE 11434

# The container counts as healthy once GET /api/tags succeeds on the local API port.
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
    CMD curl -f http://localhost:11434/api/tags || exit 1
116 changes: 111 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,35 @@
# Evaluated once at parse time: 1 when `nvidia-smi` is found on PATH, 0 otherwise.
# The start targets compare this value to choose between the GPU and CPU compose files.
HAS_GPU := $(shell command -v nvidia-smi > /dev/null && echo 1 || echo 0)

# Print an overview of the available targets.
# Declared phony so that a file named `help` in the working tree cannot shadow it.
.PHONY: help
help:
	@echo "PDF Document Layout Analysis - Available Commands:"
	@echo ""
	@echo "📄 Standard PDF Analysis (main app only):"
	@echo " make start - Auto-detects GPU, starts main app only"
	@echo " make start_no_gpu - Forces CPU mode, starts main app only"
	@echo " make start_detached - Background mode, main app only (CPU)"
	@echo " make start_detached_gpu - Background mode, main app only (GPU)"
	@echo ""
	@echo "🌐 With Translation Features (includes Ollama):"
	@echo " make start_translation - Auto-detects GPU, includes Ollama"
	@echo " make start_translation_no_gpu - Forces CPU mode, includes Ollama"
	@echo ""
	@echo "🧪 Testing & Utilities:"
	@echo " make test - Run Python tests"
	@echo " make stop - Stop all services"
	@echo ""
	@echo "🔧 Development:"
	@echo " make install_venv - Create virtual environment"
	@echo " make install - Install dependencies"
	@echo " make formatter - Format code with black"
	@echo " make check_format - Check code formatting"
	@echo ""
	@echo "🧹 Cleanup:"
	@echo " make remove_docker_containers - Remove Docker containers"
	@echo " make remove_docker_images - Remove Docker images"
	@echo " make free_up_space - Free up system space"
	@echo ""
	@echo "💡 Tip: Use 'make start' for basic PDF analysis, 'make start_translation' for translation features"

# Install or upgrade the Python dependencies inside the project virtual environment.
# `&&` (instead of `;`) stops pip from running when the venv cannot be activated.
install:
	. .venv/bin/activate && pip install -Ur requirements.txt

Expand Down Expand Up @@ -31,19 +61,88 @@ else
endif
ifeq ($(HAS_GPU), 1)
@echo "NVIDIA GPU detected, using docker-compose-gpu.yml"
docker compose -f docker-compose-gpu.yml up --build
docker compose -f docker-compose-gpu.yml up --build pdf-document-layout-analysis-gpu
else
@echo "No NVIDIA GPU detected, using docker-compose.yml"
docker compose -f docker-compose.yml up --build
docker compose -f docker-compose.yml up --build pdf-document-layout-analysis
endif


# Start only the main analysis service from the default (CPU) compose file, in the foreground.
# The earlier unscoped `docker compose up --build` line is dropped: it would bring up every
# service in the compose file and block before the service-scoped command could run.
start_no_gpu:
	mkdir -p ./models
	@echo "Starting with CPU-only configuration"
	docker compose up --build pdf-document-layout-analysis

# Start the analysis service together with Ollama for translation support.
# Chooses the GPU or CPU compose file from HAS_GPU, starts Ollama detached, polls its Docker
# health status every 5 seconds for up to about 60 seconds, then starts the translation-enabled
# app in the foreground. A container that is still not healthy only produces a warning.
#
# The status is matched with `grep -qx` (whole-line match): a plain `grep -q "healthy"` also
# matches the status `unhealthy` and would report a failing container as healthy.
start_translation:
ifeq ($(OS), Windows_NT)
	if not exist models mkdir models
else
	mkdir -p ./models
endif
ifeq ($(HAS_GPU), 1)
	@echo "NVIDIA GPU detected, starting with translation support (GPU-enabled Ollama)"
	@echo "Starting Ollama GPU container first..."
	docker compose -f docker-compose-gpu.yml up -d ollama-gpu
	@echo "Waiting for Ollama to be healthy..."
	@timeout=60; while [ $$timeout -gt 0 ]; do \
		if docker inspect --format='{{.State.Health.Status}}' ollama-service-gpu 2>/dev/null | grep -qx "healthy"; then \
			echo "Ollama GPU container is healthy!"; \
			break; \
		fi; \
		echo "Waiting for Ollama GPU container to be healthy... ($$timeout seconds remaining)"; \
		sleep 5; \
		timeout=$$((timeout-5)); \
	done
	@if ! docker inspect --format='{{.State.Health.Status}}' ollama-service-gpu 2>/dev/null | grep -qx "healthy"; then \
		echo "Warning: Ollama GPU container may not be fully healthy yet, but continuing..."; \
	fi
	@echo "Starting all services with translation support..."
	docker compose -f docker-compose-gpu.yml up --build pdf-document-layout-analysis-gpu-translation
else
	@echo "No NVIDIA GPU detected, starting with translation support (CPU Ollama)"
	@echo "Starting Ollama container first..."
	docker compose -f docker-compose.yml up -d ollama
	@echo "Waiting for Ollama to be healthy..."
	@timeout=60; while [ $$timeout -gt 0 ]; do \
		if docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -qx "healthy"; then \
			echo "Ollama container is healthy!"; \
			break; \
		fi; \
		echo "Waiting for Ollama container to be healthy... ($$timeout seconds remaining)"; \
		sleep 5; \
		timeout=$$((timeout-5)); \
	done
	@if ! docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -qx "healthy"; then \
		echo "Warning: Ollama container may not be fully healthy yet, but continuing..."; \
	fi
	@echo "Starting all services with translation support..."
	docker compose -f docker-compose.yml up --build pdf-document-layout-analysis-translation
endif

# CPU-only variant of the translation stack: starts Ollama detached from the default compose
# file, polls its Docker health status every 5 seconds for up to about 60 seconds, then starts
# the translation-enabled app in the foreground. A container that is still not healthy only
# produces a warning.
#
# The status is matched with `grep -qx` (whole-line match): a plain `grep -q "healthy"` also
# matches the status `unhealthy` and would report a failing container as healthy.
start_translation_no_gpu:
	mkdir -p ./models
	@echo "Starting with CPU-only configuration and translation support"
	@echo "Starting Ollama container first..."
	docker compose up -d ollama
	@echo "Waiting for Ollama to be healthy..."
	@timeout=60; while [ $$timeout -gt 0 ]; do \
		if docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -qx "healthy"; then \
			echo "Ollama container is healthy!"; \
			break; \
		fi; \
		echo "Waiting for Ollama container to be healthy... ($$timeout seconds remaining)"; \
		sleep 5; \
		timeout=$$((timeout-5)); \
	done
	@if ! docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -qx "healthy"; then \
		echo "Warning: Ollama container may not be fully healthy yet, but continuing..."; \
	fi
	@echo "Starting all services with translation support..."
	docker compose up --build pdf-document-layout-analysis-translation

# Stop the services of both compose projects: first the default compose file,
# then docker-compose-gpu.yml.
stop:
	docker compose stop
	docker compose -f docker-compose-gpu.yml stop

test:
. .venv/bin/activate; command cd src; command python -m pytest
Expand All @@ -68,11 +167,18 @@ free_up_space:

# Start only the main analysis service from the default (CPU) compose file, in the background.
# The earlier unscoped `docker compose up --build -d` line is dropped: it would also start
# every other service in the compose file, contradicting the "main app only" behaviour.
start_detached:
	mkdir -p ./models
	@echo "Starting in detached mode"
	docker compose up --build -d pdf-document-layout-analysis
	@echo "Main application started in background. Check status with: docker compose ps"
	@echo "View logs with: docker compose logs -f pdf-document-layout-analysis"

# Start only the GPU analysis service from docker-compose-gpu.yml, in the background.
# RESTART_IF_NO_GPU=true is passed through to the container by the compose file.
# The earlier unscoped `up --build -d` line is dropped: it would also start every other
# service in the GPU compose file, contradicting the "main app only" behaviour.
start_detached_gpu:
	mkdir -p ./models
	@echo "Starting in detached mode with GPU"
	RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d pdf-document-layout-analysis-gpu
	@echo "Main application started in background. Check status with: docker compose ps"
	@echo "View logs with: docker compose logs -f pdf-document-layout-analysis-gpu"


# Run `pip-upgrade` inside the project virtual environment.
# `&&` (instead of `;`) stops the upgrade from running when the venv cannot be activated.
upgrade:
	. .venv/bin/activate && pip-upgrade
87 changes: 77 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,18 +66,23 @@ This project provides a powerful and flexible PDF analysis microservice built wi

### 1. Start the Service

**With GPU support (recommended for better performance):**
**Standard PDF Analysis (recommended for most users):**
```bash
make start
```

**Without GPU support:**
**With Translation Features (includes Ollama container):**
```bash
make start_no_gpu
make start_translation
```

The service will be available at `http://localhost:5060`

**See all available commands:**
```bash
make help
```

**Check service status:**

```bash
Expand Down Expand Up @@ -170,8 +175,8 @@ The service provides a comprehensive RESTful API with the following endpoints:

| Endpoint | Method | Description | Parameters |
|----------|--------|-------------|------------|
| `/markdown` | POST | Convert PDF to Markdown (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
| `/html` | POST | Convert PDF to HTML (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
| `/markdown` | POST | Convert PDF to Markdown (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file`, `target_languages`, `translation_model` |
| `/html` | POST | Convert PDF to HTML (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file`, `target_languages`, `translation_model` |
| `/visualize` | POST | Visualize segmentation results on the PDF | `file`, `fast` |

### OCR & Utility Endpoints
Expand All @@ -192,6 +197,8 @@ The service provides a comprehensive RESTful API with the following endpoints:
- **`types`**: Comma-separated content types to extract (string, default: "all")
- **`extract_toc`**: Include table of contents at the beginning of the output (boolean, default: false)
- **`dpi`**: Image resolution for conversion (integer, default: 120)
- **`target_languages`**: Comma-separated list of target languages for translation (e.g. "Turkish, Spanish, French")
- **`translation_model`**: Ollama model to use for translation (string, default: "gpt-oss")

## 💡 Usage Examples

Expand Down Expand Up @@ -254,15 +261,75 @@ curl -X POST http://localhost:5060/markdown \
curl -X POST http://localhost:5060/html \
-F 'file=@document.pdf' \
-F 'extract_toc=true' \
-F 'output_file=document.html' \
--output 'document.zip'
```

**Convert to Markdown with Translation:**
```bash
curl -X POST http://localhost:5060/markdown \
-F 'file=@document.pdf' \
-F 'output_file=document.md' \
-F 'target_languages=Turkish, Spanish' \
-F 'translation_model=gpt-oss' \
--output 'document.zip'
```

**Convert to HTML with Translation:**
```bash
curl -X POST http://localhost:5060/html \
-F 'file=@document.pdf' \
-F 'output_file=document.html' \
-F 'target_languages=French, Russian' \
-F 'translation_model=huihui_ai/hunyuan-mt-abliterated' \
--output 'document.zip'
```

> **📋 Segmentation Data**: Format conversion endpoints automatically include detailed segmentation data in the zip output. The resulting zip file contains a `{filename}_segmentation.json` file with information about each detected document segment including:
> - **Coordinates**: `left`, `top`, `width`, `height`
> - **Page information**: `page_number`, `page_width`, `page_height`
> - **Content**: `text` content and segment `type` (e.g., "Title", "Text", "Table", "Picture")
> **📋 Segmentation Data & Translations**: Format conversion endpoints automatically include detailed segmentation data in the zip output. The resulting zip file contains:
> - **Original file**: The converted document in the requested format
> - **Segmentation data**: `{filename}_segmentation.json` file with information about each detected document segment:
>   - **Coordinates**: `left`, `top`, `width`, `height`
>   - **Page information**: `page_number`, `page_width`, `page_height`
>   - **Content**: `text` content and segment `type` (e.g., "Title", "Text", "Table", "Picture")
> - **Translated files** (if `target_languages` specified): `{filename}_{language}.{extension}` for each target language
> - **Images** (if present): `{filename}_pictures/` directory containing extracted images

### Translation Features

The `/markdown` and `/html` endpoints support automatic translation of the converted content into multiple languages using Ollama models.

**Translation Requirements:**
- The specified translation model must be available in Ollama
- An `output_file` must be specified (translations are only included in zip responses)

**Supported Translation Models:**
- Any Ollama-compatible model (e.g., `gpt-oss`, `llama2`, `mistral`)
- Models are automatically downloaded if not present locally

**Translation Process:**
1. The service checks if the specified model is available in Ollama
2. If not available, it attempts to download the model using `ollama pull`
3. For each target language, the content is translated while preserving:
   - Original formatting and structure
   - Markdown/HTML syntax
   - Links and references
   - Image references and tables
4. Translated files are named: `{filename}_{language}.{extension}`

_**Note: translation quality depends mostly on the model used. Smaller models may produce output with many unexpected or undesired elements. To balance performance and quality for regular users, we tested several reasonably sized models; the results for `gpt-oss` were satisfactory, which is why it is the default. If you need something smaller, you can also try `huihui_ai/hunyuan-mt-abliterated`, which gave decent results in our tests, especially when the text has little styling.**_

**Example Translation Output:**
```
document.zip
├── document.md # Source text with markdown/html styling
├── document_Spanish.md # Spanish translation
├── document_French.md # French translation
├── document_Turkish.md # Turkish translation
├── document_segmentation.json # Segmentation information
└── document_pictures/ # (if images present)
├── document_1_1.png
└── document_1_2.png
```

### OCR Processing

Expand Down
58 changes: 55 additions & 3 deletions docker-compose-gpu.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,53 @@
services:
# Ollama with GPU access: inherits the `ollama` service from docker-compose.yml and adds
# an NVIDIA device reservation. The container name `ollama-service-gpu` is the one the
# Makefile inspects while waiting for the health check.
# Two stale keys are dropped here: the `pdf-document-layout-analysis-gpu:` service key,
# which is defined in full further down, and the duplicate
# `service: pdf-document-layout-analysis` entry under `extends`.
ollama-gpu:
  extends:
    file: docker-compose.yml
    service: ollama
  container_name: ollama-service-gpu
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            count: 1
            capabilities: [ gpu ]
  environment:
    - NVIDIA_VISIBLE_DEVICES=all

# Main analysis service built from the local Dockerfile, with one NVIDIA GPU reserved.
# Serves the app through gunicorn with uvicorn workers on port 5060.
pdf-document-layout-analysis-gpu:
  container_name: pdf-document-layout-analysis-gpu
  build:
    context: .
    dockerfile: Dockerfile
  init: true
  restart: unless-stopped
  entrypoint:
    - "gunicorn"
    - "-k"
    - "uvicorn.workers.UvicornWorker"
    - "--chdir"
    - "./src"
    - "app:app"
    - "--bind"
    - "0.0.0.0:5060"
    - "--timeout"
    - "10000"
  ports:
    - "5060:5060"
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            count: 1
            capabilities:
              - gpu
  environment:
    - RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
    - OLLAMA_HOST=http://localhost:11434

pdf-document-layout-analysis-gpu-translation:
container_name: pdf-document-layout-analysis-gpu-translation
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
init: true
restart: unless-stopped
build:
context: .
dockerfile: Dockerfile
ports:
- "5060:5060"
depends_on:
ollama-gpu:
condition: service_healthy
deploy:
resources:
reservations:
Expand All @@ -11,4 +56,11 @@ services:
count: 1
capabilities: [ gpu ]
environment:
- RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
- RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
- OLLAMA_HOST=http://ollama-gpu:11434
networks:
- pdf-analysis-network

# Bridge network that the translation-enabled service attaches to.
networks:
  pdf-analysis-network:
    driver: bridge
Loading