huridocs · ali6parmak · Oct 10, 2025 · Oct 1, 2025 · Oct 1, 2025 · Oct 2, 2025
diff --git a/Dockerfile b/Dockerfile
@@ -2,7 +2,7 @@ FROM pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 
 RUN apt-get update
-RUN apt-get install --fix-missing -y -q --no-install-recommends libgomp1 ffmpeg libsm6 pdftohtml libxext6 git ninja-build g++ qpdf pandoc
+RUN apt-get install --fix-missing -y -q --no-install-recommends libgomp1 ffmpeg libsm6 pdftohtml libxext6 git ninja-build g++ qpdf pandoc curl
 
 
 RUN apt-get install -y ocrmypdf
@@ -52,4 +52,3 @@ RUN python src/download_models.py
 ENV PYTHONPATH "${PYTHONPATH}:/app/src"
 ENV TRANSFORMERS_VERBOSITY=error
 ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
-
diff --git a/Dockerfile.ollama b/Dockerfile.ollama
@@ -0,0 +1,15 @@
+# Ollama server image with curl added so the container can health-check itself.
+FROM ollama/ollama:latest
+
+# curl is needed only by the HEALTHCHECK below; skip recommended packages and
+# drop the apt lists in the same layer to keep the image small.
+RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*
+
+# Bind the API to all interfaces, not just loopback.
+ENV OLLAMA_HOST=0.0.0.0:11434
+
+EXPOSE 11434
+
+# The container is reported healthy once the API answers on /api/tags.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
+    CMD curl -f http://localhost:11434/api/tags || exit 1
diff --git a/Makefile b/Makefile
@@ -1,5 +1,36 @@
 HAS_GPU := $(shell command -v nvidia-smi > /dev/null && echo 1 || echo 0)
 
+.PHONY: help
+help: ## Print the available make commands grouped by purpose (phony, so a file named "help" cannot shadow it)
+	@echo "PDF Document Layout Analysis - Available Commands:"
+	@echo ""
+	@echo "📄 Standard PDF Analysis (main app only):"
+	@echo "  make start              - Auto-detects GPU, starts main app only"
+	@echo "  make start_no_gpu       - Forces CPU mode, starts main app only"
+	@echo "  make start_detached     - Background mode, main app only (CPU)"
+	@echo "  make start_detached_gpu - Background mode, main app only (GPU)"
+	@echo ""
+	@echo "🌐 With Translation Features (includes Ollama):"
+	@echo "  make start_translation           - Auto-detects GPU, includes Ollama"
+	@echo "  make start_translation_no_gpu    - Forces CPU mode, includes Ollama"
+	@echo ""
+	@echo "🧪 Testing & Utilities:"
+	@echo "  make test              - Run Python tests"
+	@echo "  make stop              - Stop all services"
+	@echo ""
+	@echo "🔧 Development:"
+	@echo "  make install_venv      - Create virtual environment"
+	@echo "  make install           - Install dependencies"
+	@echo "  make formatter         - Format code with black"
+	@echo "  make check_format      - Check code formatting"
+	@echo ""
+	@echo "🧹 Cleanup:"
+	@echo "  make remove_docker_containers - Remove Docker containers"
+	@echo "  make remove_docker_images     - Remove Docker images"
+	@echo "  make free_up_space           - Free up system space"
+	@echo ""
+	@echo "💡 Tip: Use 'make start' for basic PDF analysis, 'make start_translation' for translation features"
+
 install:
 	. .venv/bin/activate; pip install -Ur requirements.txt
 
@@ -31,19 +61,88 @@ else
 endif
 ifeq ($(HAS_GPU), 1)
 	@echo "NVIDIA GPU detected, using docker-compose-gpu.yml"
-	docker compose -f docker-compose-gpu.yml up --build
+	docker compose -f docker-compose-gpu.yml up --build pdf-document-layout-analysis-gpu
 else
 	@echo "No NVIDIA GPU detected, using docker-compose.yml"
-	docker compose -f docker-compose.yml up --build
+	docker compose -f docker-compose.yml up --build pdf-document-layout-analysis
 endif
 
 
 start_no_gpu:
 	mkdir -p ./models
-	docker compose up --build
+	@echo "Starting with CPU-only configuration"
+	docker compose up --build pdf-document-layout-analysis
+
+start_translation: ## Start the app with translation support: bring up Ollama, wait up to 60s for it to be healthy, then start the app in the foreground
+ifeq ($(OS), Windows_NT) # cmd.exe has no "mkdir -p", so use its own idiom
+	if not exist models mkdir models
+else
+	mkdir -p ./models
+endif
+ifeq ($(HAS_GPU), 1) # health status is matched with "grep -qx" so that "unhealthy" never counts as "healthy"
+	@echo "NVIDIA GPU detected, starting with translation support (GPU-enabled Ollama)"
+	@echo "Starting Ollama GPU container first..."
+	docker compose -f docker-compose-gpu.yml up -d ollama-gpu
+	@echo "Waiting for Ollama to be healthy..."
+	@timeout=60; while [ $$timeout -gt 0 ]; do \
+		if docker inspect --format='{{.State.Health.Status}}' ollama-service-gpu 2>/dev/null | grep -qx "healthy"; then \
+			echo "Ollama GPU container is healthy!"; \
+			break; \
+		fi; \
+		echo "Waiting for Ollama GPU container to be healthy... ($$timeout seconds remaining)"; \
+		sleep 5; \
+		timeout=$$((timeout-5)); \
+	done
+	@if ! docker inspect --format='{{.State.Health.Status}}' ollama-service-gpu 2>/dev/null | grep -qx "healthy"; then \
+		echo "Warning: Ollama GPU container may not be fully healthy yet, but continuing..."; \
+	fi
+	@echo "Starting all services with translation support..."
+	docker compose -f docker-compose-gpu.yml up --build pdf-document-layout-analysis-gpu-translation
+else
+	@echo "No NVIDIA GPU detected, starting with translation support (CPU Ollama)"
+	@echo "Starting Ollama container first..."
+	docker compose -f docker-compose.yml up -d ollama
+	@echo "Waiting for Ollama to be healthy..."
+	@timeout=60; while [ $$timeout -gt 0 ]; do \
+		if docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -qx "healthy"; then \
+			echo "Ollama container is healthy!"; \
+			break; \
+		fi; \
+		echo "Waiting for Ollama container to be healthy... ($$timeout seconds remaining)"; \
+		sleep 5; \
+		timeout=$$((timeout-5)); \
+	done
+	@if ! docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -qx "healthy"; then \
+		echo "Warning: Ollama container may not be fully healthy yet, but continuing..."; \
+	fi
+	@echo "Starting all services with translation support..."
+	docker compose -f docker-compose.yml up --build pdf-document-layout-analysis-translation
+endif
+
+start_translation_no_gpu: ## CPU-only start with translation: bring up Ollama, wait up to 60s for an exact "healthy" status ("unhealthy" does not count), then start the app
+	mkdir -p ./models
+	@echo "Starting with CPU-only configuration and translation support"
+	@echo "Starting Ollama container first..."
+	docker compose up -d ollama
+	@echo "Waiting for Ollama to be healthy..."
+	@timeout=60; while [ $$timeout -gt 0 ]; do \
+		if docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -qx "healthy"; then \
+			echo "Ollama container is healthy!"; \
+			break; \
+		fi; \
+		echo "Waiting for Ollama container to be healthy... ($$timeout seconds remaining)"; \
+		sleep 5; \
+		timeout=$$((timeout-5)); \
+	done
+	@if ! docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -qx "healthy"; then \
+		echo "Warning: Ollama container may not be fully healthy yet, but continuing..."; \
+	fi
+	@echo "Starting all services with translation support..."
+	docker compose up --build pdf-document-layout-analysis-translation
 
 stop:
 	docker compose stop
+	docker compose -f docker-compose-gpu.yml stop
 
 test:
 	. .venv/bin/activate; command cd src; command python -m pytest
@@ -68,11 +167,18 @@ free_up_space:
 
 start_detached:
 	mkdir -p ./models
-	docker compose up --build -d
+	@echo "Starting in detached mode"
+	docker compose up --build -d pdf-document-layout-analysis
+	@echo "Main application started in background. Check status with: docker compose ps"
+	@echo "View logs with: docker compose logs -f pdf-document-layout-analysis"
 
 start_detached_gpu:
 	mkdir -p ./models
-	RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d
+	@echo "Starting in detached mode with GPU"
+	RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d pdf-document-layout-analysis-gpu
+	@echo "Main application started in background. Check status with: docker compose ps"
+	@echo "View logs with: docker compose logs -f pdf-document-layout-analysis-gpu"
+
 
 upgrade:
 	. .venv/bin/activate; pip-upgrade
diff --git a/README.md b/README.md
@@ -66,18 +66,23 @@ This project provides a powerful and flexible PDF analysis microservice built wi
 
 ### 1. Start the Service
 
-**With GPU support (recommended for better performance):**
+**Standard PDF Analysis (recommended for most users):**
 ```bash
 make start
 ```
 
-**Without GPU support:**
+**With Translation Features (includes Ollama container):**
 ```bash
-make start_no_gpu
+make start_translation
 ```
 
 The service will be available at `http://localhost:5060`
 
+**See all available commands:**
+```bash
+make help
+```
+
 **Check service status:**
 
 ```bash
@@ -170,8 +175,8 @@ The service provides a comprehensive RESTful API with the following endpoints:
 
 | Endpoint | Method | Description | Parameters |
 |----------|--------|-------------|------------|
-| `/markdown` | POST | Convert PDF to Markdown (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
-| `/html` | POST | Convert PDF to HTML (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
+| `/markdown` | POST | Convert PDF to Markdown (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file`, `target_languages`, `translation_model` |
+| `/html` | POST | Convert PDF to HTML (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file`, `target_languages`, `translation_model` |
 | `/visualize` | POST | Visualize segmentation results on the PDF | `file`, `fast` |
 
 ### OCR & Utility Endpoints
@@ -192,6 +197,8 @@ The service provides a comprehensive RESTful API with the following endpoints:
 - **`types`**: Comma-separated content types to extract (string, default: "all")
 - **`extract_toc`**: Include table of contents at the beginning of the output (boolean, default: false)
 - **`dpi`**: Image resolution for conversion (integer, default: 120)
+- **`target_languages`**: Comma-separated list of target languages for translation (e.g. "Turkish, Spanish, French")
+- **`translation_model`**: Ollama model to use for translation (string, default: "gpt-oss")
 
 ## 💡 Usage Examples
 
@@ -254,15 +261,75 @@ curl -X POST http://localhost:5060/markdown \
 curl -X POST http://localhost:5060/html \
   -F 'file=@document.pdf' \
   -F 'extract_toc=true' \
-  -F 'output_file=document.html' \
+  -F 'output_file=document.html' \
+  --output 'document.zip'
+```
+
+**Convert to Markdown with Translation:**
+```bash
+curl -X POST http://localhost:5060/markdown \
+  -F 'file=@document.pdf' \
+  -F 'output_file=document.md' \
+  -F 'target_languages=Turkish, Spanish' \
+  -F 'translation_model=gpt-oss' \
+  --output 'document.zip'
+```
+
+**Convert to HTML with Translation:**
+```bash
+curl -X POST http://localhost:5060/html \
+  -F 'file=@document.pdf' \
+  -F 'output_file=document.html' \
+  -F 'target_languages=French, Russian' \
+  -F 'translation_model=huihui_ai/hunyuan-mt-abliterated' \
   --output 'document.zip'
 ```
 
-> **📋 Segmentation Data**: Format conversion endpoints automatically include detailed segmentation data in the zip output. The resulting zip file contains a `{filename}_segmentation.json` file with information about each detected document segment including:
-> - **Coordinates**: `left`, `top`, `width`, `height`
-> - **Page information**: `page_number`, `page_width`, `page_height` 
-> - **Content**: `text` content and segment `type` (e.g., "Title", "Text", "Table", "Picture")
+> **📋 Segmentation Data & Translations**: Format conversion endpoints automatically include detailed segmentation data in the zip output. The resulting zip file contains:
+> - **Original file**: The converted document in the requested format
+> - **Segmentation data**: `{filename}_segmentation.json` file with information about each detected document segment:
+>   - **Coordinates**: `left`, `top`, `width`, `height`
+>   - **Page information**: `page_number`, `page_width`, `page_height` 
+>   - **Content**: `text` content and segment `type` (e.g., "Title", "Text", "Table", "Picture")
+> - **Translated files** (if `target_languages` specified): `{filename}_{language}.{extension}` for each target language
+> - **Images** (if present): `{filename}_pictures/` directory containing extracted images
+
+### Translation Features
+
+The `/markdown` and `/html` endpoints support automatic translation of the converted content into multiple languages using Ollama models.
 
+**Translation Requirements:**
+- The specified translation model must be available in Ollama
+- An `output_file` must be specified (translations are only included in zip responses)
+
+**Supported Translation Models:**
+- Any Ollama-compatible model (e.g., `gpt-oss`, `llama2`, `mistral`)
+- Models are automatically downloaded if not present locally
+
+**Translation Process:**
+1. The service checks if the specified model is available in Ollama
+2. If not available, it attempts to download the model using `ollama pull`
+3. For each target language, the content is translated while preserving:
+   - Original formatting and structure
+   - Markdown/HTML syntax
+   - Links and references
+   - Image references and tables
+4. Translated files are named: `{filename}_{language}.{extension}`
+
+_**Note: translation quality depends mostly on the model used. Smaller models may produce output with many unexpected or undesired elements. To balance performance and quality for regular users, we tested several reasonably sized models. The results for `gpt-oss` were satisfactory, which is why it is the default model. If you need something smaller, you can also try `huihui_ai/hunyuan-mt-abliterated`; in our tests it gave decent results, especially when the text does not have much styling.**_
+
+**Example Translation Output:**
+```
+document.zip
+├── document.md                   # Source text with markdown/html styling
+├── document_Spanish.md           # Spanish translation  
+├── document_French.md            # French translation
+├── document_Turkish.md           # Turkish translation
+├── document_segmentation.json    # Segmentation information
+└── document_pictures/       # (if images present)
+    ├── document_1_1.png
+    └── document_1_2.png
+```
 
 ### OCR Processing
 

diff --git a/docker-compose-gpu.yml b/docker-compose-gpu.yml
@@ -1,8 +1,53 @@
 services:
-  pdf-document-layout-analysis-gpu:
+  ollama-gpu:
     extends:
       file: docker-compose.yml
-      service: pdf-document-layout-analysis
+      service: ollama
+    container_name: ollama-service-gpu
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [ gpu ]
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+
+  pdf-document-layout-analysis-gpu:  # Main app built from ./Dockerfile with one NVIDIA GPU reserved; declares no depends_on
+    container_name: pdf-document-layout-analysis-gpu
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]  # gunicorn with uvicorn workers serving app:app from ./src on port 5060
+    init: true  # run an init process as PID 1 inside the container
+    restart: unless-stopped
+    build:
+      context: .
+      dockerfile: Dockerfile
+    ports:
+      - "5060:5060"  # host:container, same port as the gunicorn --bind above
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [ gpu ]
+    environment:
+      - RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU  # interpolated from the environment of the `docker compose` invocation
+      - OLLAMA_HOST=http://localhost:11434  # NOTE(review): localhost is this container itself; presumably nothing listens there for this non-translation service — confirm intended
+
+  pdf-document-layout-analysis-gpu-translation:
+    container_name: pdf-document-layout-analysis-gpu-translation
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
+    init: true
+    restart: unless-stopped
+    build:
+      context: .
+      dockerfile: Dockerfile
+    ports:
+      - "5060:5060"
+    depends_on:
+      ollama-gpu:
+        condition: service_healthy
     deploy:
       resources:
         reservations:
@@ -11,4 +56,11 @@ services:
               count: 1
               capabilities: [ gpu ]
     environment:
-      - RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
+      - RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
+      - OLLAMA_HOST=http://ollama-gpu:11434
+    networks:
+      - pdf-analysis-network
+
+networks:
+  pdf-analysis-network:
+    driver: bridge