diff --git a/Dockerfile b/Dockerfile
index aaf501a..7c4ecd2 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@ FROM pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN apt-get update
-RUN apt-get install --fix-missing -y -q --no-install-recommends libgomp1 ffmpeg libsm6 pdftohtml libxext6 git ninja-build g++ qpdf pandoc
+RUN apt-get install --fix-missing -y -q --no-install-recommends libgomp1 ffmpeg libsm6 pdftohtml libxext6 git ninja-build g++ qpdf pandoc curl
RUN apt-get install -y ocrmypdf
@@ -52,4 +52,3 @@ RUN python src/download_models.py
ENV PYTHONPATH "${PYTHONPATH}:/app/src"
ENV TRANSFORMERS_VERBOSITY=error
ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
-
diff --git a/Dockerfile.ollama b/Dockerfile.ollama
new file mode 100644
index 0000000..d7047bc
--- /dev/null
+++ b/Dockerfile.ollama
@@ -0,0 +1,10 @@
+FROM ollama/ollama:latest
+
+RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
+
+ENV OLLAMA_HOST=0.0.0.0:11434
+
+EXPOSE 11434
+
+HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
+ CMD curl -f http://localhost:11434/api/tags || exit 1
diff --git a/Makefile b/Makefile
index 10db0fd..d0db65f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,35 @@
HAS_GPU := $(shell command -v nvidia-smi > /dev/null && echo 1 || echo 0)
+help:
+ @echo "PDF Document Layout Analysis - Available Commands:"
+ @echo ""
+ @echo "๐ Standard PDF Analysis (main app only):"
+ @echo " make start - Auto-detects GPU, starts main app only"
+ @echo " make start_no_gpu - Forces CPU mode, starts main app only"
+ @echo " make start_detached - Background mode, main app only (CPU)"
+ @echo " make start_detached_gpu - Background mode, main app only (GPU)"
+ @echo ""
+ @echo "๐ With Translation Features (includes Ollama):"
+ @echo " make start_translation - Auto-detects GPU, includes Ollama"
+ @echo " make start_translation_no_gpu - Forces CPU mode, includes Ollama"
+ @echo ""
+ @echo "๐งช Testing & Utilities:"
+ @echo " make test - Run Python tests"
+ @echo " make stop - Stop all services"
+ @echo ""
+ @echo "๐ง Development:"
+ @echo " make install_venv - Create virtual environment"
+ @echo " make install - Install dependencies"
+ @echo " make formatter - Format code with black"
+ @echo " make check_format - Check code formatting"
+ @echo ""
+ @echo "๐งน Cleanup:"
+ @echo " make remove_docker_containers - Remove Docker containers"
+ @echo " make remove_docker_images - Remove Docker images"
+ @echo " make free_up_space - Free up system space"
+ @echo ""
+ @echo "๐ก Tip: Use 'make start' for basic PDF analysis, 'make start_translation' for translation features"
+
install:
. .venv/bin/activate; pip install -Ur requirements.txt
@@ -31,19 +61,88 @@ else
endif
ifeq ($(HAS_GPU), 1)
@echo "NVIDIA GPU detected, using docker-compose-gpu.yml"
- docker compose -f docker-compose-gpu.yml up --build
+ docker compose -f docker-compose-gpu.yml up --build pdf-document-layout-analysis-gpu
else
@echo "No NVIDIA GPU detected, using docker-compose.yml"
- docker compose -f docker-compose.yml up --build
+ docker compose -f docker-compose.yml up --build pdf-document-layout-analysis
endif
start_no_gpu:
mkdir -p ./models
- docker compose up --build
+ @echo "Starting with CPU-only configuration"
+ docker compose up --build pdf-document-layout-analysis
+
+start_translation:
+ifeq ($(OS), Windows_NT)
+ if not exist models mkdir models
+else
+ mkdir -p ./models
+endif
+ifeq ($(HAS_GPU), 1)
+ @echo "NVIDIA GPU detected, starting with translation support (GPU-enabled Ollama)"
+ @echo "Starting Ollama GPU container first..."
+ docker compose -f docker-compose-gpu.yml up -d ollama-gpu
+ @echo "Waiting for Ollama to be healthy..."
+ @timeout=60; while [ $$timeout -gt 0 ]; do \
+ if docker inspect --format='{{.State.Health.Status}}' ollama-service-gpu 2>/dev/null | grep -q "healthy"; then \
+ echo "Ollama GPU container is healthy!"; \
+ break; \
+ fi; \
+ echo "Waiting for Ollama GPU container to be healthy... ($$timeout seconds remaining)"; \
+ sleep 5; \
+ timeout=$$((timeout-5)); \
+ done
+ @if ! docker inspect --format='{{.State.Health.Status}}' ollama-service-gpu 2>/dev/null | grep -q "healthy"; then \
+ echo "Warning: Ollama GPU container may not be fully healthy yet, but continuing..."; \
+ fi
+ @echo "Starting all services with translation support..."
+ docker compose -f docker-compose-gpu.yml up --build pdf-document-layout-analysis-gpu-translation
+else
+ @echo "No NVIDIA GPU detected, starting with translation support (CPU Ollama)"
+ @echo "Starting Ollama container first..."
+ docker compose -f docker-compose.yml up -d ollama
+ @echo "Waiting for Ollama to be healthy..."
+ @timeout=60; while [ $$timeout -gt 0 ]; do \
+ if docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -q "healthy"; then \
+ echo "Ollama container is healthy!"; \
+ break; \
+ fi; \
+ echo "Waiting for Ollama container to be healthy... ($$timeout seconds remaining)"; \
+ sleep 5; \
+ timeout=$$((timeout-5)); \
+ done
+ @if ! docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -q "healthy"; then \
+ echo "Warning: Ollama container may not be fully healthy yet, but continuing..."; \
+ fi
+ @echo "Starting all services with translation support..."
+ docker compose -f docker-compose.yml up --build pdf-document-layout-analysis-translation
+endif
+
+start_translation_no_gpu:
+ mkdir -p ./models
+ @echo "Starting with CPU-only configuration and translation support"
+ @echo "Starting Ollama container first..."
+ docker compose up -d ollama
+ @echo "Waiting for Ollama to be healthy..."
+ @timeout=60; while [ $$timeout -gt 0 ]; do \
+ if docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -q "healthy"; then \
+ echo "Ollama container is healthy!"; \
+ break; \
+ fi; \
+ echo "Waiting for Ollama container to be healthy... ($$timeout seconds remaining)"; \
+ sleep 5; \
+ timeout=$$((timeout-5)); \
+ done
+ @if ! docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -q "healthy"; then \
+ echo "Warning: Ollama container may not be fully healthy yet, but continuing..."; \
+ fi
+ @echo "Starting all services with translation support..."
+ docker compose up --build pdf-document-layout-analysis-translation
stop:
docker compose stop
+ docker compose -f docker-compose-gpu.yml stop
test:
. .venv/bin/activate; command cd src; command python -m pytest
@@ -68,11 +167,18 @@ free_up_space:
start_detached:
mkdir -p ./models
- docker compose up --build -d
+ @echo "Starting in detached mode"
+ docker compose up --build -d pdf-document-layout-analysis
+ @echo "Main application started in background. Check status with: docker compose ps"
+ @echo "View logs with: docker compose logs -f pdf-document-layout-analysis"
start_detached_gpu:
mkdir -p ./models
- RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d
+ @echo "Starting in detached mode with GPU"
+ RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d pdf-document-layout-analysis-gpu
+ @echo "Main application started in background. Check status with: docker compose ps"
+ @echo "View logs with: docker compose logs -f pdf-document-layout-analysis-gpu"
+
upgrade:
. .venv/bin/activate; pip-upgrade
\ No newline at end of file
diff --git a/README.md b/README.md
index e6533ce..8e72db2 100644
--- a/README.md
+++ b/README.md
@@ -66,18 +66,23 @@ This project provides a powerful and flexible PDF analysis microservice built wi
### 1. Start the Service
-**With GPU support (recommended for better performance):**
+**Standard PDF Analysis (recommended for most users):**
```bash
make start
```
-**Without GPU support:**
+**With Translation Features (includes Ollama container):**
```bash
-make start_no_gpu
+make start_translation
```
The service will be available at `http://localhost:5060`
+**See all available commands:**
+```bash
+make help
+```
+
**Check service status:**
```bash
@@ -170,8 +175,8 @@ The service provides a comprehensive RESTful API with the following endpoints:
| Endpoint | Method | Description | Parameters |
|----------|--------|-------------|------------|
-| `/markdown` | POST | Convert PDF to Markdown (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
-| `/html` | POST | Convert PDF to HTML (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
+| `/markdown` | POST | Convert PDF to Markdown (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file`, `target_languages`, `translation_model` |
+| `/html` | POST | Convert PDF to HTML (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file`, `target_languages`, `translation_model` |
| `/visualize` | POST | Visualize segmentation results on the PDF | `file`, `fast` |
### OCR & Utility Endpoints
@@ -192,6 +197,8 @@ The service provides a comprehensive RESTful API with the following endpoints:
- **`types`**: Comma-separated content types to extract (string, default: "all")
- **`extract_toc`**: Include table of contents at the beginning of the output (boolean, default: false)
- **`dpi`**: Image resolution for conversion (integer, default: 120)
+- **`target_languages`**: Comma-separated list of target languages for translation (e.g. "Turkish, Spanish, French")
+- **`translation_model`**: Ollama model to use for translation (string, default: "gpt-oss")
## ๐ก Usage Examples
@@ -254,15 +261,75 @@ curl -X POST http://localhost:5060/markdown \
curl -X POST http://localhost:5060/html \
-F 'file=@document.pdf' \
-F 'extract_toc=true' \
- -F 'output_file=document.html' \
+  -F 'output_file=document.html' \
+ --output 'document.zip'
+```
+
+**Convert to Markdown with Translation:**
+```bash
+curl -X POST http://localhost:5060/markdown \
+ -F 'file=@document.pdf' \
+ -F 'output_file=document.md' \
+ -F 'target_languages=Turkish, Spanish' \
+ -F 'translation_model=gpt-oss' \
+ --output 'document.zip'
+```
+
+**Convert to HTML with Translation:**
+```bash
+curl -X POST http://localhost:5060/html \
+ -F 'file=@document.pdf' \
+  -F 'output_file=document.html' \
+ -F 'target_languages=French, Russian' \
+ -F 'translation_model=huihui_ai/hunyuan-mt-abliterated' \
--output 'document.zip'
```
-> **📊 Segmentation Data**: Format conversion endpoints automatically include detailed segmentation data in the zip output. The resulting zip file contains a `{filename}_segmentation.json` file with information about each detected document segment including:
-> - **Coordinates**: `left`, `top`, `width`, `height`
-> - **Page information**: `page_number`, `page_width`, `page_height`
-> - **Content**: `text` content and segment `type` (e.g., "Title", "Text", "Table", "Picture")
+> **📊 Segmentation Data & Translations**: Format conversion endpoints automatically include detailed segmentation data in the zip output. The resulting zip file contains:
+> - **Original file**: The converted document in the requested format
+> - **Segmentation data**: `{filename}_segmentation.json` file with information about each detected document segment:
+>   - **Coordinates**: `left`, `top`, `width`, `height`
+>   - **Page information**: `page_number`, `page_width`, `page_height`
+>   - **Content**: `text` content and segment `type` (e.g., "Title", "Text", "Table", "Picture")
+> - **Translated files** (if `target_languages` specified): `{filename}_{language}.{extension}` for each target language
+> - **Images** (if present): `{filename}_pictures/` directory containing extracted images
+
+### Translation Features
+
+The `/markdown` and `/html` endpoints support automatic translation of the converted content into multiple languages using Ollama models.
+**Translation Requirements:**
+- The specified translation model must be available in Ollama
+- An `output_file` must be specified (translations are only included in zip responses)
+
+**Supported Translation Models:**
+- Any Ollama-compatible model (e.g., `gpt-oss`, `llama2`, `mistral`, etc.)
+- Models are automatically downloaded if not present locally
+
+**Translation Process:**
+1. The service checks if the specified model is available in Ollama
+2. If not available, it attempts to download the model using `ollama pull`
+3. For each target language, the content is translated while preserving:
+ - Original formatting and structure
+ - Markdown/HTML syntax
+ - Links and references
+ - Image references and tables
+4. Translated files are named: `{filename}_{language}.{extension}`
+
+_**Note that the quality of translations mostly depends on the models used. When using smaller models, the output may contain many unexpected or undesired elements. For regular users, we aimed for a balance between performance and quality, so we tested with different models with a reasonable size. The results for `gpt-oss` were satisfactory, which is why we set it as the default model. If you need something smaller you can also try `huihui_ai/hunyuan-mt-abliterated`, we saw it gives decent results especially if the text does not have much styling.**_
+
+**Example Translation Output:**
+```
+document.zip
+├── document.md                 # Source text with markdown/html styling
+├── document_Spanish.md         # Spanish translation
+├── document_French.md          # French translation
+├── document_Turkish.md         # Turkish translation
+├── document_segmentation.json  # Segmentation information
+└── document_pictures/          # (if images present)
+    ├── document_1_1.png
+    └── document_1_2.png
+```
### OCR Processing
diff --git a/docker-compose-gpu.yml b/docker-compose-gpu.yml
index d60fb94..45ffb11 100755
--- a/docker-compose-gpu.yml
+++ b/docker-compose-gpu.yml
@@ -1,8 +1,53 @@
services:
- pdf-document-layout-analysis-gpu:
+ ollama-gpu:
extends:
file: docker-compose.yml
- service: pdf-document-layout-analysis
+ service: ollama
+ container_name: ollama-service-gpu
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: 1
+ capabilities: [ gpu ]
+ environment:
+ - NVIDIA_VISIBLE_DEVICES=all
+
+ pdf-document-layout-analysis-gpu:
+ container_name: pdf-document-layout-analysis-gpu
+ entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
+ init: true
+ restart: unless-stopped
+ build:
+ context: .
+ dockerfile: Dockerfile
+ ports:
+ - "5060:5060"
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: 1
+ capabilities: [ gpu ]
+ environment:
+ - RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
+ - OLLAMA_HOST=http://localhost:11434
+
+ pdf-document-layout-analysis-gpu-translation:
+ container_name: pdf-document-layout-analysis-gpu-translation
+ entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
+ init: true
+ restart: unless-stopped
+ build:
+ context: .
+ dockerfile: Dockerfile
+ ports:
+ - "5060:5060"
+ depends_on:
+ ollama-gpu:
+ condition: service_healthy
deploy:
resources:
reservations:
@@ -11,4 +56,11 @@ services:
count: 1
capabilities: [ gpu ]
environment:
- - RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
\ No newline at end of file
+ - RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
+ - OLLAMA_HOST=http://ollama-gpu:11434
+ networks:
+ - pdf-analysis-network
+
+networks:
+ pdf-analysis-network:
+ driver: bridge
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index b69105f..0bf5800 100755
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,4 +1,21 @@
services:
+ ollama:
+ container_name: ollama-service
+ build:
+ context: .
+ dockerfile: Dockerfile.ollama
+ restart: unless-stopped
+ ports:
+ - "11434:11434"
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
+ interval: 30s
+ timeout: 10s
+ retries: 3
+ start_period: 30s
+ networks:
+ - pdf-analysis-network
+
pdf-document-layout-analysis:
container_name: pdf-document-layout-analysis
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
@@ -9,3 +26,27 @@ services:
dockerfile: Dockerfile
ports:
- "5060:5060"
+ environment:
+ - OLLAMA_HOST=http://localhost:11434
+
+ pdf-document-layout-analysis-translation:
+ container_name: pdf-document-layout-analysis-translation
+ entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
+ init: true
+ restart: unless-stopped
+ build:
+ context: .
+ dockerfile: Dockerfile
+ ports:
+ - "5060:5060"
+ depends_on:
+ ollama:
+ condition: service_healthy
+ environment:
+ - OLLAMA_HOST=http://ollama:11434
+ networks:
+ - pdf-analysis-network
+
+networks:
+ pdf-analysis-network:
+ driver: bridge
diff --git a/justfile b/justfile
index a2a13b3..a049f89 100644
--- a/justfile
+++ b/justfile
@@ -1,5 +1,35 @@
HAS_GPU := `command -v nvidia-smi > /dev/null && echo 1 || echo 0`
+help:
+ @echo "PDF Document Layout Analysis - Available Commands:"
+ @echo ""
+ @echo "๐ Standard PDF Analysis (main app only):"
+ @echo " just start - Auto-detects GPU, starts main app only"
+ @echo " just start_no_gpu - Forces CPU mode, starts main app only"
+ @echo " just start_detached - Background mode, main app only (CPU)"
+ @echo " just start_detached_gpu - Background mode, main app only (GPU)"
+ @echo ""
+ @echo "๐ With Translation Features (includes Ollama):"
+ @echo " just start_translation - Auto-detects GPU, includes Ollama"
+ @echo " just start_translation_no_gpu - Forces CPU mode, includes Ollama"
+ @echo ""
+ @echo "๐งช Testing & Utilities:"
+ @echo " just test - Run Python tests"
+ @echo " just stop - Stop all services"
+ @echo ""
+ @echo "๐ง Development:"
+ @echo " just install_venv - Create virtual environment"
+ @echo " just install - Install dependencies"
+ @echo " just formatter - Format code with black"
+ @echo " just check_format - Check code formatting"
+ @echo ""
+ @echo "๐งน Cleanup:"
+ @echo " just remove_docker_containers - Remove Docker containers"
+ @echo " just remove_docker_images - Remove Docker images"
+ @echo " just free_up_space - Free up system space"
+ @echo ""
+ @echo "๐ก Tip: Use 'just start' for basic PDF analysis, 'just start_translation' for translation features"
+
install:
. .venv/bin/activate; pip install -Ur requirements.txt
@@ -27,18 +57,88 @@ start:
mkdir -p ./models
if [ {{HAS_GPU}} -eq 1 ]; then \
echo "NVIDIA GPU detected, using docker-compose-gpu.yml"; \
- docker compose -f docker-compose-gpu.yml up --build; \
+ docker compose -f docker-compose-gpu.yml up --build pdf-document-layout-analysis-gpu; \
else \
echo "No NVIDIA GPU detected, using docker-compose.yml"; \
- docker compose -f docker-compose.yml up --build; \
+ docker compose -f docker-compose.yml up --build pdf-document-layout-analysis; \
fi
start_no_gpu:
mkdir -p ./models
- docker compose up --build
+ @echo "Starting with CPU-only configuration"
+ docker compose up --build pdf-document-layout-analysis
+
+start_translation:
+ #!/bin/bash
+ mkdir -p ./models
+ if [ {{HAS_GPU}} -eq 1 ]; then
+ echo "NVIDIA GPU detected, starting with translation support (GPU-enabled Ollama)"
+ echo "Starting Ollama GPU container first..."
+ docker compose -f docker-compose-gpu.yml up -d ollama-gpu
+ echo "Waiting for Ollama to be healthy..."
+ timeout=60
+ while [ $timeout -gt 0 ]; do
+ if docker inspect --format='{{"{{"}}.State.Health.Status{{"}}"}}' ollama-service-gpu 2>/dev/null | grep -q "healthy"; then
+ echo "Ollama GPU container is healthy!"
+ break
+ fi
+ echo "Waiting for Ollama GPU container to be healthy... ($timeout seconds remaining)"
+ sleep 5
+ timeout=$((timeout-5))
+ done
+ if ! docker inspect --format='{{"{{"}}.State.Health.Status{{"}}"}}' ollama-service-gpu 2>/dev/null | grep -q "healthy"; then
+ echo "Warning: Ollama GPU container may not be fully healthy yet, but continuing..."
+ fi
+ echo "Starting all services with translation support..."
+ docker compose -f docker-compose-gpu.yml up --build pdf-document-layout-analysis-gpu-translation
+ else
+ echo "No NVIDIA GPU detected, starting with translation support (CPU Ollama)"
+ echo "Starting Ollama container first..."
+ docker compose -f docker-compose.yml up -d ollama
+ echo "Waiting for Ollama to be healthy..."
+ timeout=60
+ while [ $timeout -gt 0 ]; do
+ if docker inspect --format='{{"{{"}}.State.Health.Status{{"}}"}}' ollama-service 2>/dev/null | grep -q "healthy"; then
+ echo "Ollama container is healthy!"
+ break
+ fi
+ echo "Waiting for Ollama container to be healthy... ($timeout seconds remaining)"
+ sleep 5
+ timeout=$((timeout-5))
+ done
+ if ! docker inspect --format='{{"{{"}}.State.Health.Status{{"}}"}}' ollama-service 2>/dev/null | grep -q "healthy"; then
+ echo "Warning: Ollama container may not be fully healthy yet, but continuing..."
+ fi
+ echo "Starting all services with translation support..."
+ docker compose -f docker-compose.yml up --build pdf-document-layout-analysis-translation
+ fi
+
+start_translation_no_gpu:
+ #!/bin/bash
+ mkdir -p ./models
+ echo "Starting with CPU-only configuration and translation support"
+ echo "Starting Ollama container first..."
+ docker compose up -d ollama
+ echo "Waiting for Ollama to be healthy..."
+ timeout=60
+ while [ $timeout -gt 0 ]; do
+ if docker inspect --format='{{"{{"}}.State.Health.Status{{"}}"}}' ollama-service 2>/dev/null | grep -q "healthy"; then
+ echo "Ollama container is healthy!"
+ break
+ fi
+ echo "Waiting for Ollama container to be healthy... ($timeout seconds remaining)"
+ sleep 5
+ timeout=$((timeout-5))
+ done
+ if ! docker inspect --format='{{"{{"}}.State.Health.Status{{"}}"}}' ollama-service 2>/dev/null | grep -q "healthy"; then
+ echo "Warning: Ollama container may not be fully healthy yet, but continuing..."
+ fi
+ echo "Starting all services with translation support..."
+ docker compose up --build pdf-document-layout-analysis-translation
stop:
docker compose stop
+ docker compose -f docker-compose-gpu.yml stop
test:
. .venv/bin/activate; command cd src; command python -m pytest
@@ -62,11 +162,17 @@ free_up_space:
start_detached:
mkdir -p ./models
- docker compose up --build -d
+ @echo "Starting in detached mode"
+ docker compose up --build -d pdf-document-layout-analysis
+ @echo "Main application started in background. Check status with: docker compose ps"
+ @echo "View logs with: docker compose logs -f pdf-document-layout-analysis"
start_detached_gpu:
mkdir -p ./models
- RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d
+ @echo "Starting in detached mode with GPU"
+ RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d pdf-document-layout-analysis-gpu
+ @echo "Main application started in background. Check status with: docker compose ps"
+ @echo "View logs with: docker compose logs -f pdf-document-layout-analysis-gpu"
upgrade:
. .venv/bin/activate; pip-upgrade
diff --git a/requirements.txt b/requirements.txt
index 2423e3a..6cc74eb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,4 +24,5 @@ rapidocr==3.2.0
pix2tex==0.1.4
latex2mathml==3.78.0
PyMuPDF==1.25.5
+ollama==0.6.0
git+https://github.com/huridocs/pdf-features.git@2025.10.1.1
\ No newline at end of file
diff --git a/src/adapters/infrastructure/html_conversion_service_adapter.py b/src/adapters/infrastructure/html_conversion_service_adapter.py
index b41e202..8b51fcf 100644
--- a/src/adapters/infrastructure/html_conversion_service_adapter.py
+++ b/src/adapters/infrastructure/html_conversion_service_adapter.py
@@ -19,5 +19,9 @@ def convert_to_html(
extract_toc: bool = False,
dpi: int = 120,
output_file: Optional[str] = None,
+ target_languages: Optional[list[str]] = None,
+ translation_model: str = "gpt-oss",
) -> Union[str, Response]:
- return self.convert_to_format(pdf_content, segments, extract_toc, dpi, output_file)
+ return self.convert_to_format(
+ pdf_content, segments, extract_toc, dpi, output_file, target_languages, translation_model
+ )
diff --git a/src/adapters/infrastructure/markdown_conversion_service_adapter.py b/src/adapters/infrastructure/markdown_conversion_service_adapter.py
index 2b37115..ec57b5a 100644
--- a/src/adapters/infrastructure/markdown_conversion_service_adapter.py
+++ b/src/adapters/infrastructure/markdown_conversion_service_adapter.py
@@ -19,5 +19,9 @@ def convert_to_markdown(
extract_toc: bool = False,
dpi: int = 120,
output_file: Optional[str] = None,
+ target_languages: Optional[list[str]] = None,
+ translation_model: str = "gpt-oss",
) -> Union[str, Response]:
- return self.convert_to_format(pdf_content, segments, extract_toc, dpi, output_file)
+ return self.convert_to_format(
+ pdf_content, segments, extract_toc, dpi, output_file, target_languages, translation_model
+ )
diff --git a/src/adapters/infrastructure/markup_conversion/pdf_to_markup_service_adapter.py b/src/adapters/infrastructure/markup_conversion/pdf_to_markup_service_adapter.py
index f4da869..08e50ac 100644
--- a/src/adapters/infrastructure/markup_conversion/pdf_to_markup_service_adapter.py
+++ b/src/adapters/infrastructure/markup_conversion/pdf_to_markup_service_adapter.py
@@ -10,6 +10,7 @@
from pdf2image import convert_from_path
from starlette.responses import Response
+from configuration import service_logger
from domain.SegmentBox import SegmentBox
from pdf_features.PdfFeatures import PdfFeatures
from pdf_features.PdfToken import PdfToken
@@ -22,6 +23,8 @@
from adapters.infrastructure.markup_conversion.OutputFormat import OutputFormat
from adapters.infrastructure.markup_conversion.Link import Link
from adapters.infrastructure.markup_conversion.ExtractedImage import ExtractedImage
+from adapters.infrastructure.translation.ollama_container_manager import OllamaContainerManager
+from adapters.infrastructure.translation.translate_markup_document import translate_markup
class PdfToMarkupServiceAdapter:
@@ -35,6 +38,8 @@ def convert_to_format(
extract_toc: bool = False,
dpi: int = 120,
output_file: Optional[str] = None,
+ target_languages: Optional[list[str]] = None,
+ translation_model: str = "gpt-oss",
) -> Union[str, Response]:
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
temp_file.write(pdf_content)
@@ -44,10 +49,19 @@ def convert_to_format(
extracted_images: list[ExtractedImage] = [] if output_file else None
user_base_name = Path(output_file).stem if output_file else None
- content = self._generate_content(temp_pdf_path, segments, extract_toc, dpi, extracted_images, user_base_name)
+ content_parts = self._get_styled_content_parts(
+ temp_pdf_path, segments, extract_toc, dpi, extracted_images, user_base_name
+ )
+ content = "".join(content_parts)
if output_file:
- return self._create_zip_response(content, extracted_images, output_file, segments)
+ translations = {}
+ if target_languages and len(target_languages) > 0 and content_parts:
+ translations = self._generate_translations(
+ segments, content_parts, target_languages, translation_model, extract_toc
+ )
+
+ return self._create_zip_response(content, extracted_images, output_file, segments, translations)
return content
finally:
@@ -60,6 +74,7 @@ def _create_zip_response(
extracted_images: list[ExtractedImage],
output_filename: str,
segments: list[SegmentBox],
+ translations: Optional[dict[str, str]] = None,
) -> Response:
zip_buffer = io.BytesIO()
@@ -73,6 +88,12 @@ def _create_zip_response(
for image in extracted_images:
zip_file.writestr(f"{pictures_dir}{image.filename}", image.image_data)
+ if translations:
+ output_path = Path(output_filename)
+ for language, translated_content in translations.items():
+ translated_filename = f"{output_path.stem}_{language}{output_path.suffix}"
+ zip_file.writestr(translated_filename, translated_content.encode("utf-8"))
+
base_name = Path(output_filename).stem
segmentation_filename = f"{base_name}_segmentation.json"
segmentation_data = self._create_segmentation_json(segments)
@@ -93,6 +114,29 @@ def _create_segmentation_json(self, segments: list[SegmentBox]) -> str:
segmentation_data.append(segment.to_dict())
return json.dumps(segmentation_data, indent=4, ensure_ascii=False)
+ def _generate_translations(
+ self,
+ segments: list[SegmentBox],
+ content_parts: list[str],
+ target_languages: list[str],
+ translation_model: str,
+ extract_toc: bool = False,
+ ) -> dict[str, str]:
+ translations = {}
+
+ ollama_manager = OllamaContainerManager()
+ if not ollama_manager.ensure_service_ready(translation_model):
+ return translations
+
+ for target_language in target_languages:
+ service_logger.info(f"\033[96mTranslating content to {target_language}\033[0m")
+ translated_content = translate_markup(
+ ollama_manager, self.output_format, segments, content_parts, translation_model, target_language, extract_toc
+ )
+ translations[target_language] = translated_content
+
+ return translations
+
def _create_pdf_labels_from_segments(self, vgt_segments: list[SegmentBox]) -> PdfLabels:
page_numbers = sorted(set(segment.page_number for segment in vgt_segments))
page_labels: list[PageLabels] = []
@@ -309,7 +353,7 @@ def _set_segment_ids(self, vgt_segments: list[SegmentBox]) -> None:
for segment_index, segment in enumerate(segments):
segment.id = f"page-{page_number}-{segment_index}"
- def _generate_content(
+ def _get_styled_content_parts(
self,
pdf_path: Path,
vgt_segments: list[SegmentBox],
@@ -358,4 +402,4 @@ def _generate_content(
self._process_regular_segment(tokens_in_seg, segment, links_by_source, links_by_dest)
)
- return "".join(content_parts)
+ return content_parts
diff --git a/src/adapters/infrastructure/translation/decode_html_content.py b/src/adapters/infrastructure/translation/decode_html_content.py
new file mode 100644
index 0000000..58b56c4
--- /dev/null
+++ b/src/adapters/infrastructure/translation/decode_html_content.py
@@ -0,0 +1,48 @@
+import re
+
+
+def decode_html(text, link_map, doc_ref_map):
+ # 1. Decode bold+italic first
+ def bold_italic_decoder(match):
+ return f"{match.group(2)}"
+
+ text = re.sub(r"\[BI(\d+)\](.*?)\[BI\1\]", bold_italic_decoder, text)
+
+ # 2. Decode bold
+ def bold_decoder(match):
+ return f"{match.group(2)}"
+
+ text = re.sub(r"\[B(\d+)\](.*?)\[B\1\]", bold_decoder, text)
+
+ # 3. Decode italic
+ def italic_decoder(match):
+ return f"{match.group(2)}"
+
+ text = re.sub(r"\[IT(\d+)\](.*?)\[IT\1\]", italic_decoder, text)
+
+ # 4. Decode links
+ def link_decoder(match):
+ idx = int(match.group(1))
+ if idx < len(link_map):
+ label, url = link_map[idx]
+            return f'<a href="{url}">{match.group(2)}</a>'
+ else:
+ # Return original text if index is out of range
+ return match.group(0)
+
+ text = re.sub(r"\[LINK(\d+)\](.*?)\[LINK\1\]", link_decoder, text)
+
+ # 5. Decode doc refs (same as markdown since they're custom)
+ def doc_ref_decoder(match):
+ idx = int(match.group(1))
+ if idx < len(doc_ref_map):
+ return doc_ref_map[idx]
+ else:
+ # Return original text if index is out of range
+ return match.group(0)
+
+ text = re.sub(r"\[DOCREF(\d+)\]", doc_ref_decoder, text)
+ text = text.replace("] (#page", "](#page")
+ text = " ".join(text.split())
+
+ return text
diff --git a/src/adapters/infrastructure/translation/decode_markdown_content.py b/src/adapters/infrastructure/translation/decode_markdown_content.py
new file mode 100644
index 0000000..663314b
--- /dev/null
+++ b/src/adapters/infrastructure/translation/decode_markdown_content.py
@@ -0,0 +1,46 @@
+import re
+
+
+def decode_markdown(text, link_map, doc_ref_map):
+    # 1. Decode bold+italic first (before bold/italic so [BI..] is not eaten by [B..]/[IT..])
+    def bold_italic_decoder(match):
+        return f"**_{match.group(2)}_**"
+
+    text = re.sub(r"\[BI(\d+)\](.*?)\[BI\1\]", bold_italic_decoder, text)
+
+    # 2. Decode bold
+    def bold_decoder(match):
+        return f"**{match.group(2)}**"
+
+    text = re.sub(r"\[B(\d+)\](.*?)\[B\1\]", bold_decoder, text)
+
+    # 3. Decode italic
+    def italic_decoder(match):
+        return f"_{match.group(2)}_"
+
+    text = re.sub(r"\[IT(\d+)\](.*?)\[IT\1\]", italic_decoder, text)
+
+    # 4. Decode links: rebuild [label](url) from the (label, url) pairs in link_map
+    def link_decoder(match):
+        idx = int(match.group(1))
+        if idx < len(link_map):
+            label, url = link_map[idx]
+            return f"[{match.group(2)}]({url})"
+        else:
+            return match.group(0)  # unknown placeholder index -> keep text untouched
+
+    text = re.sub(r"\[LINK(\d+)\](.*?)\[LINK\1\]", link_decoder, text)
+
+    # 5. Decode doc refs: restore the verbatim reference captured by the encoder
+    def doc_ref_decoder(match):
+        idx = int(match.group(1))
+        if idx < len(doc_ref_map):
+            return doc_ref_map[idx]
+        else:
+            return match.group(0)  # unknown placeholder index -> keep text untouched
+
+    text = re.sub(r"\[DOCREF(\d+)\]", doc_ref_decoder, text)
+    text = text.replace("] (#page", "](#page")  # undo a space the model may insert before anchors
+    text = " ".join(text.split())  # collapse whitespace/newlines to single spaces
+
+    return text
diff --git a/src/adapters/infrastructure/translation/download_translation_model.py b/src/adapters/infrastructure/translation/download_translation_model.py
new file mode 100644
index 0000000..d45c449
--- /dev/null
+++ b/src/adapters/infrastructure/translation/download_translation_model.py
@@ -0,0 +1,69 @@
+import subprocess
+import time
+from configuration import service_logger
+
+OLLAMA_NOT_RUNNING_MSG = "could not connect to ollama server"  # substring the ollama CLI prints when the daemon is down
+
+
+def is_ollama_running():
+    try:
+        result = subprocess.run(["ollama", "ls"], capture_output=True, text=True)  # cheap liveness probe
+        msg = result.stderr.lower() + result.stdout.lower()  # the CLI may report the error on either stream
+        return OLLAMA_NOT_RUNNING_MSG not in msg and result.returncode == 0
+    except FileNotFoundError:
+        service_logger.error("Ollama is not installed or not in PATH.")
+        return False
+
+
+def start_ollama():
+    try:
+        subprocess.Popen(["ollama", "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)  # detached daemon
+        service_logger.info("Starting Ollama server...")
+        time.sleep(5)  # give the daemon a moment to bind its port before callers probe it
+        return True
+    except Exception as e:
+        service_logger.error(f"Failed to start Ollama server: {e}")
+        return False
+
+
+def model_name_variants(name):
+    base = name.split(":")[0]  # strip any ":tag" suffix
+    return {base, f"{base}:latest", name}  # all spellings `ollama ls` may use for the same model
+
+
+def ensure_ollama_model(model_name):
+    if not is_ollama_running():
+        service_logger.info("Ollama server is not running. Attempting to start it...")
+        if not start_ollama():
+            service_logger.error("Could not start Ollama server. Exiting.")
+            return False
+        for _ in range(5):  # poll up to ~10s for the daemon to come up
+            if is_ollama_running():
+                break
+            time.sleep(2)
+        else:  # for/else: loop exhausted without break -> daemon never answered
+            service_logger.error("Ollama server did not start in time.")
+            return False
+
+    try:
+        result = subprocess.run(["ollama", "ls"], capture_output=True, text=True, check=True)
+    except subprocess.CalledProcessError as e:
+        service_logger.error(f"Error running 'ollama ls': {e}")
+        return False
+
+    model_lines = [line.split()[0] for line in result.stdout.splitlines() if line and not line.startswith("NAME")]  # first column, header skipped
+    available_models = set(model_lines)
+    variants = model_name_variants(model_name)
+
+    if available_models & variants:  # any accepted spelling already installed
+        service_logger.info(f"Model '{model_name}' already exists in Ollama.")
+        return True
+
+    service_logger.info(f"Model '{model_name}' not found. Pulling...")
+    try:
+        subprocess.run(["ollama", "pull", model_name], check=True)
+        service_logger.info(f"Model '{model_name}' pulled successfully.")
+        return True
+    except subprocess.CalledProcessError as e:
+        service_logger.error(f"Failed to pull model '{model_name}': {e}")
+        return False
diff --git a/src/adapters/infrastructure/translation/encode_html_content.py b/src/adapters/infrastructure/translation/encode_html_content.py
new file mode 100644
index 0000000..d5d5d16
--- /dev/null
+++ b/src/adapters/infrastructure/translation/encode_html_content.py
@@ -0,0 +1,129 @@
+import re
+
+
+def encode_html(text):
+    """Replace HTML markup with translator-safe placeholder tags.
+
+    Returns (encoded_text, link_map, doc_ref_map) so decode_html can restore
+    the markup after translation."""
+    # NOTE(review): the replaced character below is a literal non-breaking space — confirm against source data
+    text = text.replace("\u00a0", " ")
+
+    link_map = []
+    doc_ref_map = []
+    bold_map = []
+    italic_map = []
+    bold_italic_map = []
+
+    # Helper to encode doc refs
+    def doc_ref_replacer(match):
+        idx = len(doc_ref_map)
+        doc_ref_map.append(match.group(0))
+        return f"[DOCREF{idx}]"
+
+    # Helper to encode bold+italic
+    def bold_italic_replacer(match):
+        text_content = match.group(1)
+        idx = len(bold_italic_map)
+        bold_italic_map.append(text_content)
+        return f"[BI{idx}]{text_content}[BI{idx}]"
+
+    # Helper to encode bold
+    def bold_replacer(match):
+        text_content = match.group(1)
+        idx = len(bold_map)
+        bold_map.append(text_content)
+        return f"[B{idx}]{text_content}[B{idx}]"
+
+    # 1. Encode document references FIRST
+    # Handles patterns like [\[9,](#page-5-9), [10,](#page-5-10), [11\]](#page-5-11), [\[12\]](#page-5-12)
+    text = re.sub(r"(\[\\?\[?\*?\d+[,\.]?\\?\]?\]\(#page-\d+-\d+\))", doc_ref_replacer, text)
+
+    # Also handle the original patterns from markdown version
+    text = re.sub(r"(\[(?:\\?\d+[,\.]? ?)+\]\(#page-\d+-\d+\))", doc_ref_replacer, text)
+    text = re.sub(r"(\[\d+[,\.]?\]\(#page-\d+-\d+\))", doc_ref_replacer, text)
+    text = re.sub(r"(\[\*\d+[,\.]?\]\(#page-\d+-\d+\))", doc_ref_replacer, text)
+
+    # 2. Encode links BEFORE formatting - handle complex nested structures
+    def find_and_replace_links(text):
+        offset = 0  # Track how much the text has shifted due to replacements
+
+        # Find all opening <a href="..."> tags
+        pattern = r'<a\s+href="([^"]*)"[^>]*>'
+        matches = list(re.finditer(pattern, text, re.IGNORECASE))
+
+        # Process from left to right
+        for match in matches:
+            start = match.start() + offset
+            href = match.group(1)
+
+            # Find the matching closing tag
+            tag_count = 1
+            pos = match.end() + offset
+            content_start = pos
+
+            while pos < len(text) and tag_count > 0:
+                # Look for nested opening <a ...> tags
+                next_open = text.find("<a ", pos)
+                # Look for closing </a> tags (either case)
+                next_close = text.find("</a>", pos)
+                next_close_case = text.find("</A>", pos)
+                if next_close_case != -1 and (next_close == -1 or next_close_case < next_close):
+                    next_close = next_close_case
+
+                if next_close == -1:
+                    break
+
+                if next_open != -1 and next_open < next_close:
+                    # Found nested opening tag
+                    tag_count += 1
+                    pos = next_open + 3
+                else:
+                    # Found closing tag
+                    tag_count -= 1
+                    if tag_count == 0:
+                        # This is our matching closing tag
+                        content = text[content_start:next_close]
+                        end = next_close + 4  # +4 for </a>
+
+                        # Replace the entire link
+                        idx = len(link_map)
+                        link_map.append((content, href))
+                        replacement = f"[LINK{idx}]{content}[LINK{idx}]"
+
+                        original_length = end - start
+                        new_length = len(replacement)
+
+                        text = text[:start] + replacement + text[end:]
+                        offset += new_length - original_length
+                        break
+                    else:
+                        pos = next_close + 4
+
+        return text
+
+    text = find_and_replace_links(text)
+
+    # 3. Encode bold+italic combinations BEFORE individual bold/italic
+    # Handle <b><i>text</i></b> and <i><b>text</b></i> - only simple cases without nested links
+    text = re.sub(r"<b><i>([^<]+)</i></b>", bold_italic_replacer, text, flags=re.IGNORECASE)
+    text = re.sub(r"<i><b>([^<]+)</b></i>", bold_italic_replacer, text, flags=re.IGNORECASE)
+
+    # 4. Encode bold (<b>text</b>) - only simple cases without nested tags
+    text = re.sub(r"<b>([^<]+)</b>", bold_replacer, text, flags=re.IGNORECASE)
+
+    # 5. Encode italic (<i>text</i>) - handle cases that might contain encoded links
+    def italic_with_links_replacer(match):
+        content = match.group(1)
+        idx = len(italic_map)
+        italic_map.append(content)
+        return f"[IT{idx}]{content}[IT{idx}]"
+
+    # Handle italic tags that might contain encoded links
+    text = re.sub(
+        r"<i>([^<]*(?:\[LINK\d+\][^\[]*\[LINK\d+\][^<]*)*)</i>", italic_with_links_replacer, text, flags=re.IGNORECASE
+    )
+    # Handle simple italic tags
+    text = re.sub(r"<i>([^<]+)</i>", italic_with_links_replacer, text, flags=re.IGNORECASE)
+
+    return text, link_map, doc_ref_map
diff --git a/src/adapters/infrastructure/translation/encode_markdown_content.py b/src/adapters/infrastructure/translation/encode_markdown_content.py
new file mode 100644
index 0000000..acaf2d9
--- /dev/null
+++ b/src/adapters/infrastructure/translation/encode_markdown_content.py
@@ -0,0 +1,80 @@
+import re
+
+
+def encode_markdown(text):
+ text = text.replace("_ _", " ")
+
+ link_map = []
+ doc_ref_map = []
+ bold_map = []
+ italic_map = []
+ bold_italic_map = []
+
+ # Helper to encode links with nested brackets in label
+ def link_replacer(match):
+ label = match.group(1)[1:-1] # Remove outer []
+ url = match.group(3)
+ idx = len(link_map)
+ link_map.append((label, url))
+ return f"[LINK{idx}]{label}[LINK{idx}]"
+
+ # Helper to encode doc refs - sequential numbering
+ def doc_ref_replacer(match):
+ idx = len(doc_ref_map)
+ doc_ref_map.append(match.group(0))
+ return f"[DOCREF{idx}]"
+
+ # Helper to encode bold+italic
+ def bold_italic_replacer(match):
+ text_content = match.group(1)
+ idx = len(bold_italic_map)
+ bold_italic_map.append(text_content)
+ return f"[BI{idx}]{text_content}[BI{idx}]"
+
+ # Helper to encode bold
+ def bold_replacer(match):
+ text_content = match.group(1)
+ idx = len(bold_map)
+ bold_map.append(text_content)
+ return f"[B{idx}]{text_content}[B{idx}]"
+
+ # Helper to encode italic
+ def italic_replacer(match):
+ text_content = match.group(1)
+ idx = len(italic_map)
+ italic_map.append(text_content)
+ return f"[IT{idx}]{text_content}[IT{idx}]"
+
+ # 1. Encode ALL document references in ONE PASS for sequential numbering
+ doc_ref_patterns = [
+ r"\[\\?\[\d+,\]\(#page-\d+-\d+\)", # [\[9,](#page-5-9)
+ r"\[\\?\[\d+\\?\]\]\(#page-\d+-\d+\)", # [\[12\]](#page-5-12)
+ r"\[\*\d+\]\(#page-\d+-\d+\)", # [*1221](#page-3-7)
+ r"\[\d+,\]\(#page-\d+-\d+\)", # [10,](#page-5-10)
+ r"\[\d+\\?\]\]\(#page-\d+-\d+\)", # [11\]](#page-5-11)
+ r"\[\d+\]\(#page-\d+-\d+\)", # [1221](#page-3-7)
+ ]
+
+ # Combine all patterns with alternation (|)
+ combined_pattern = "(" + "|".join(doc_ref_patterns) + ")"
+ text = re.sub(combined_pattern, doc_ref_replacer, text)
+
+ # 2. Encode links BEFORE formatting to avoid matching underscores in URLs
+ link_pattern = re.compile(r"(\[((?:[^\[\]]+|\[[^\[\]]*\])*)\])\((https?://[^\)]+)\)")
+ while True:
+ new_text = link_pattern.sub(lambda m: link_replacer(m), text)
+ if new_text == text:
+ break
+ text = new_text
+
+ # 3. Encode bold+italic BEFORE individual bold/italic
+ text = re.sub(r"\*\*\_([^\*_]+)\_\*\*", bold_italic_replacer, text)
+ text = re.sub(r"_\*([^\*_]+)\*_", bold_italic_replacer, text)
+
+ # 4. Encode bold
+ text = re.sub(r"\*\*([^\*]+)\*\*", bold_replacer, text)
+
+ # 5. Encode italic
+ text = re.sub(r"\_([^\_]+)\_", italic_replacer, text)
+
+ return text, link_map, doc_ref_map
diff --git a/src/adapters/infrastructure/translation/ollama_container_manager.py b/src/adapters/infrastructure/translation/ollama_container_manager.py
new file mode 100644
index 0000000..4624b67
--- /dev/null
+++ b/src/adapters/infrastructure/translation/ollama_container_manager.py
@@ -0,0 +1,146 @@
+import os
+import time
+import requests
+import json
+from typing import Optional, Any
+from configuration import service_logger
+
+
+class OllamaContainerManager:
+    """HTTP client for an Ollama server: availability checks, model management and chat requests."""
+
+    def __init__(self, ollama_host: str = None):
+        # Default points at the docker-compose service name; override via OLLAMA_HOST.
+        self.ollama_host = ollama_host or os.getenv("OLLAMA_HOST", "http://ollama:11434")
+        self.api_base_url = f"{self.ollama_host}/api"
+        self.timeout = 600  # seconds allowed per chat/pull request
+        self.max_retries = 5
+
+    def is_ollama_available(self) -> bool:
+        """Return True when the Ollama HTTP API answers /api/tags."""
+        try:
+            response = requests.get(f"{self.api_base_url}/tags", timeout=10)
+            return response.status_code == 200
+        except Exception as e:
+            service_logger.debug(f"Ollama availability check failed: {e}")
+            return False
+
+    def ensure_model_available(self, model_name: str) -> bool:
+        """Check that the model exists locally, pulling it when missing."""
+        try:
+            if self._is_model_available(model_name):
+                service_logger.info(f"\033[92mModel '{model_name}' is available\033[0m")
+                return True
+
+            service_logger.info(f"\033[93mModel '{model_name}' not found. Downloading...\033[0m")
+            return self._download_model(model_name)
+
+        except Exception as e:
+            service_logger.error(f"Error ensuring model availability: {e}")
+            return False
+
+    def _is_model_available(self, model_name: str) -> bool:
+        """True when any accepted spelling of model_name is listed by /api/tags."""
+        try:
+            response = requests.get(f"{self.api_base_url}/tags", timeout=10)
+            if response.status_code != 200:
+                return False
+
+            models_data = response.json()
+            available_models = [model["name"] for model in models_data.get("models", [])]
+
+            # Accept "name", "name:latest" and the bare name without a tag.
+            model_variants = {model_name, f"{model_name}:latest", model_name.split(":")[0]}
+            return any(variant in available_models for variant in model_variants)
+
+        except Exception as e:
+            service_logger.error(f"Error checking model availability: {e}")
+            return False
+
+    def _download_model(self, model_name: str) -> bool:
+        """Stream an /api/pull request, logging progress; False when the pull fails outright."""
+        try:
+            # timeout bounds each blocking read so a dead server cannot hang the caller forever
+            response = requests.post(f"{self.api_base_url}/pull", json={"name": model_name}, stream=True, timeout=self.timeout)
+
+            if response.status_code != 200:
+                service_logger.error(f"Failed to start model download: {response.text}")
+                return False
+
+            for idx, line in enumerate(response.iter_lines()):
+                if line:
+                    try:
+                        data = json.loads(line)
+                        if "status" in data and idx % 100 == 0:  # throttle progress logging
+                            service_logger.info(f"Model download: {data['status']}")
+                        if data.get("status") == "success":
+                            service_logger.info(f"Model '{model_name}' downloaded successfully")
+                            return True
+                    except json.JSONDecodeError:
+                        continue  # skip non-JSON keep-alive lines
+
+            return True
+
+        except Exception as e:
+            service_logger.error(f"Error downloading model '{model_name}': {e}")
+            return False
+
+    def chat_with_timeout(
+        self, model: str, messages: list[dict], source_markup: str, timeout: Optional[int] = None
+    ) -> dict[str, Any] | str:
+        """Chat with retries; return source_markup unchanged when every attempt fails."""
+        timeout = timeout or self.timeout
+
+        for attempt in range(self.max_retries + 1):
+            try:
+                if attempt > 0:
+                    service_logger.info(f"Retrying chat request (attempt {attempt + 1}/{self.max_retries + 1})")
+                    time.sleep(10)  # back off before retrying
+
+                return self._make_chat_request(
+                    model,
+                    messages,
+                    timeout,
+                )
+
+            except requests.exceptions.Timeout:
+                service_logger.warning(f"Chat request timed out after {timeout} seconds (attempt {attempt + 1})")
+                if attempt < self.max_retries:
+                    continue
+                else:
+                    service_logger.error(f"Chat request failed after {self.max_retries} attempts due to timeout")
+                    return source_markup
+
+            except Exception as e:
+                service_logger.error(f"Chat request failed (attempt {attempt + 1}): {e}")
+                if attempt < self.max_retries:
+                    continue
+                else:
+                    service_logger.error(f"Chat request failed after {self.max_retries} attempts")
+                    return source_markup
+
+        return source_markup  # unreachable in practice; keeps the return total
+
+    def _make_chat_request(self, model: str, messages: list, timeout: int) -> dict[str, Any]:
+        """Single non-streaming /api/chat call; raises on any non-200 response."""
+        payload = {"model": model, "messages": messages, "stream": False}
+
+        response = requests.post(f"{self.api_base_url}/chat", json=payload, timeout=timeout)
+
+        if response.status_code != 200:
+            raise Exception(f"Chat request failed with status {response.status_code}: {response.text}")
+
+        return response.json()
+
+    def ensure_service_ready(self, model_name: str) -> bool:
+        """Availability check plus model check; False (with logging) on any problem."""
+        try:
+            if not self.is_ollama_available():
+                service_logger.error("Ollama service is not available. Make sure the Ollama container is running.")
+                return False
+
+            return self.ensure_model_available(model_name)
+
+        except Exception as e:
+            service_logger.error(f"Error ensuring service readiness: {e}")
+            return False
diff --git a/src/adapters/infrastructure/translation/translate_markup_document.py b/src/adapters/infrastructure/translation/translate_markup_document.py
new file mode 100644
index 0000000..fa73c02
--- /dev/null
+++ b/src/adapters/infrastructure/translation/translate_markup_document.py
@@ -0,0 +1,185 @@
+from adapters.infrastructure.markup_conversion.OutputFormat import OutputFormat
+from adapters.infrastructure.translation.decode_html_content import decode_html
+from adapters.infrastructure.translation.decode_markdown_content import decode_markdown
+from adapters.infrastructure.translation.encode_html_content import encode_html
+from adapters.infrastructure.translation.encode_markdown_content import encode_markdown
+from adapters.infrastructure.translation.ollama_container_manager import OllamaContainerManager
+from configuration import service_logger
+from domain.SegmentBox import SegmentBox
+from pdf_token_type_labels.TokenType import TokenType
+from tqdm import tqdm
+
+prompt = """You are a professional translator. Your task is to translate the following text to {target_language}.
+
+**CRITICAL: You must output ONLY the {target_language} translation. Do NOT repeat the source text.**
+
+
+Follow these guidelines:
+
+1. Translate all text accurately without omitting any part of the content.
+2. Preserve the tone and style of the original text.
+3. Do not change, remove, or add any markdown symbols (such as *, _, #, [ ], ( ), -, or backticks). Only translate the visible text.
+4. Do not translate person names, URLs, email addresses, or code snippets. Only translate the human-readable text.
+5. Make sure that you are returning the translation, not the source text.
+6. If you see custom tags (such as [DOCREF], [IT], [B], [LINK]), **translate the text inside the tags, but do not change, remove, or translate the tags themselves.** The tags must appear in the same positions in the output as in the input.
+7. If a word is split with a hyphen (e.g., "sec- onds"), treat it as a single word, get rid of the hyphen and translate it as one complete word in the target language.
+8. Do not include any additional comments, notes, or explanations in the output; provide only the translated text.
+
+**IMPORTANT: The text between the backticks below is the source text. You must output the {target_language} translation, NOT the source.**
+
+Here is the text to be translated:
+
+```
+{text_to_translate}
+```
+"""
+
+
+def get_translation(ollama_manager: OllamaContainerManager, model: str, content: str, source_markup: str) -> str:
+    """Request a translation from the model; source_markup is the untranslated fallback."""
+    response = ollama_manager.chat_with_timeout(
+        model=model, messages=[{"role": "user", "content": content}], source_markup=source_markup
+    )
+
+    if response is None:
+        raise Exception("Translation request failed or timed out")
+
+    if isinstance(response, str):
+        return response  # chat_with_timeout already fell back to the untranslated markup
+
+    return response["message"]["content"].replace("```", "").strip()
+
+
+def get_table_of_contents(vgt_segments: list[SegmentBox]) -> str:
+    """Build a markdown TOC from title/section-header segments, indenting by numbering depth."""
+    title_segments = [s for s in vgt_segments if s.type in {TokenType.TITLE, TokenType.SECTION_HEADER}]
+    table_of_contents = "# Table of Contents\n\n"
+    for segment in title_segments:
+        if not segment.text.strip():
+            continue
+        first_word = segment.text.split()[0]
+        indentation = max(0, first_word.count(".") - 1)  # "2.1.3" -> depth 2
+        content = " " * indentation + "- [" + segment.text + "](#" + segment.id + ")\n"  # NOTE(review): confirm indent width; nested markdown lists usually need 2+ spaces
+        table_of_contents += content
+    table_of_contents += "\n"
+    return table_of_contents + "\n\n"
+
+
+def translate_markdown(
+    ollama_manager: OllamaContainerManager,
+    segments: list[SegmentBox],
+    markdown_parts: list[str],
+    model: str,
+    target_language: str,
+    extract_toc: bool = False,
+) -> str:
+    """Translate each markdown part, keeping anchors, pictures, formulas and links intact."""
+    translated_markdown_parts: list[str] = []
+    title_segments = []
+    if extract_toc:
+        markdown_parts = markdown_parts[1:]  # drop the original TOC; it is rebuilt from translated titles
+    ten_percent_of_segments = max(1, len(markdown_parts) // 10)  # max(1, ...) avoids modulo-by-zero on short documents
+    service_logger.info(f"Starting translation of {len(markdown_parts)} segments")
+    for index, markdown_part in tqdm(enumerate(markdown_parts), total=len(markdown_parts), desc="Translating markdown"):
+        if index % ten_percent_of_segments == 0:
+            service_logger.info("")  # heartbeat so logs show progress alongside tqdm
+        markdown_part = markdown_part.strip()
+        if not markdown_part:
+            continue
+        if segments[index].type == TokenType.PICTURE:
+            translated_markdown_parts.append(markdown_part)  # pictures carry no translatable text
+            continue
+        if segments[index].type == TokenType.TABLE:
+            anchor, table_html = markdown_part.split("\n", 1)  # first line is the anchor tag
+            content = prompt.format(target_language=target_language, text_to_translate=table_html)
+            response = get_translation(ollama_manager, model, content, markdown_part)
+            translated_markdown_parts.append(anchor + "\n" + response)
+            continue
+        if segments[index].type in {TokenType.TITLE, TokenType.SECTION_HEADER}:
+            anchor, text = markdown_part.split("\n", 1)  # maxsplit=1 guards against extra newlines in the title
+            content = prompt.format(target_language=target_language, text_to_translate=text)
+            response = get_translation(ollama_manager, model, content, markdown_part)
+            translated_markdown_parts.append(anchor + "\n" + response)
+            if extract_toc:
+                title_segments.append(segments[index])
+                title_segments[-1].text = response.replace("#", "").strip()  # NOTE(review): mutates the shared SegmentBox
+            continue
+        if segments[index].type == TokenType.FORMULA:
+            translated_markdown_parts.append(markdown_part)  # formulas are kept verbatim
+            continue
+        encoded_text, link_map, doc_ref_map = encode_markdown(markdown_part)
+        content = prompt.format(target_language=target_language, text_to_translate=encoded_text)
+        response = get_translation(ollama_manager, model, content, markdown_part)
+        translated_markdown_parts.append(decode_markdown(response, link_map, doc_ref_map))
+    service_logger.info("\033[92mTranslation of markdown segments completed\033[0m")
+    if extract_toc:
+        translated_markdown_parts.insert(0, get_table_of_contents(title_segments))
+    return "\n\n".join(translated_markdown_parts)
+
+
+def translate_html(
+    ollama_manager: OllamaContainerManager,
+    segments: list[SegmentBox],
+    html_parts: list[str],
+    model: str,
+    target_language: str,
+    extract_toc: bool = False,
+) -> str:
+    """Translate each HTML part, keeping anchors, pictures, formulas and links intact."""
+    translated_html_parts: list[str] = []
+    title_segments = []
+    if extract_toc:
+        html_parts = html_parts[1:]  # drop the original TOC; it is rebuilt from translated titles
+    ten_percent_of_segments = max(1, len(html_parts) // 10)  # max(1, ...) avoids modulo-by-zero on short documents
+    service_logger.info(f"Starting translation of {len(html_parts)} segments")
+    for index, html_part in tqdm(enumerate(html_parts), total=len(html_parts), desc="Translating html"):
+        if index % ten_percent_of_segments == 0:
+            service_logger.info("")  # heartbeat so logs show progress alongside tqdm
+        html_part = html_part.strip()
+        if not html_part:
+            continue
+        if segments[index].type == TokenType.PICTURE:
+            translated_html_parts.append(html_part)  # pictures carry no translatable text
+            continue
+        if segments[index].type == TokenType.TABLE:
+            anchor, table_html = html_part.split("\n", 1)  # first line is the anchor tag
+            content = prompt.format(target_language=target_language, text_to_translate=table_html)
+            response = get_translation(ollama_manager, model, content, html_part)
+            translated_html_parts.append(anchor + "\n" + response)
+            continue
+        if segments[index].type in {TokenType.TITLE, TokenType.SECTION_HEADER}:
+            anchor, text = html_part.split("\n", 1)  # maxsplit=1 guards against extra newlines in the title
+            content = prompt.format(target_language=target_language, text_to_translate=text)
+            response = get_translation(ollama_manager, model, content, html_part)
+            translated_html_parts.append(anchor + "\n" + response)
+            if extract_toc:
+                title_segments.append(segments[index])
+                title_segments[-1].text = response.replace("#", "").strip()  # NOTE(review): mutates the shared SegmentBox
+            continue
+        if segments[index].type == TokenType.FORMULA:
+            translated_html_parts.append(html_part)  # formulas are kept verbatim
+            continue
+        encoded_text, link_map, doc_ref_map = encode_html(html_part)
+        content = prompt.format(target_language=target_language, text_to_translate=encoded_text)
+        response = get_translation(ollama_manager, model, content, html_part)
+        translated_html_parts.append(decode_html(response, link_map, doc_ref_map))
+    service_logger.info("\033[92mTranslation of html segments completed\033[0m")
+    if extract_toc:
+        translated_html_parts.insert(0, get_table_of_contents(title_segments))
+    return "\n\n".join(translated_html_parts)
+
+
+def translate_markup(
+    ollama_manager: OllamaContainerManager,
+    output_format: OutputFormat,
+    segments: list[SegmentBox],
+    markup_parts: list[str],
+    model: str,
+    target_language: str,
+    extract_toc: bool = False,
+) -> str:
+    """Dispatch to the markdown or HTML translation pipeline based on output_format."""
+    if output_format == OutputFormat.MARKDOWN:
+        return translate_markdown(ollama_manager, segments, markup_parts, model, target_language, extract_toc)
+    else:
+        return translate_html(ollama_manager, segments, markup_parts, model, target_language, extract_toc)
diff --git a/src/adapters/web/fastapi_controllers.py b/src/adapters/web/fastapi_controllers.py
index faaf8f8..6aaa11d 100644
--- a/src/adapters/web/fastapi_controllers.py
+++ b/src/adapters/web/fastapi_controllers.py
@@ -92,7 +92,13 @@ async def convert_to_markdown_endpoint(
extract_toc: bool = Form(False),
dpi: int = Form(120),
output_file: Optional[str] = Form(None),
+ target_languages: Optional[str] = Form(None),
+ translation_model: str = Form("gpt-oss"),
) -> Union[str, Response]:
+ target_languages_list = None
+ if target_languages:
+ target_languages_list = [lang.strip() for lang in target_languages.split(",") if lang.strip()]
+
return await run_in_threadpool(
self.convert_to_markdown_use_case.execute,
file.file.read(),
@@ -100,6 +106,8 @@ async def convert_to_markdown_endpoint(
extract_toc,
dpi,
output_file,
+ target_languages_list,
+ translation_model,
)
async def convert_to_html_endpoint(
@@ -109,7 +117,13 @@ async def convert_to_html_endpoint(
extract_toc: bool = Form(False),
dpi: int = Form(120),
output_file: Optional[str] = Form(None),
+ target_languages: Optional[str] = Form(None),
+ translation_model: str = Form("gpt-oss"),
) -> Union[str, Response]:
+ target_languages_list = None
+ if target_languages:
+ target_languages_list = [lang.strip() for lang in target_languages.split(",") if lang.strip()]
+
return await run_in_threadpool(
self.convert_to_html_use_case.execute,
file.file.read(),
@@ -117,4 +131,6 @@ async def convert_to_html_endpoint(
extract_toc,
dpi,
output_file,
+ target_languages_list,
+ translation_model,
)
diff --git a/src/ports/services/html_conversion_service.py b/src/ports/services/html_conversion_service.py
index 0b23d3b..021c733 100644
--- a/src/ports/services/html_conversion_service.py
+++ b/src/ports/services/html_conversion_service.py
@@ -14,5 +14,7 @@ def convert_to_html(
extract_toc: bool = False,
dpi: int = 120,
output_file: Optional[str] = None,
+ target_languages: Optional[list[str]] = None,
+ translation_model: str = "gpt-oss",
) -> Union[str, Response]:
pass
diff --git a/src/ports/services/markdown_conversion_service.py b/src/ports/services/markdown_conversion_service.py
index 1b48aea..baf8295 100644
--- a/src/ports/services/markdown_conversion_service.py
+++ b/src/ports/services/markdown_conversion_service.py
@@ -14,5 +14,7 @@ def convert_to_markdown(
extract_toc: bool = False,
dpi: int = 120,
output_file: Optional[str] = None,
+ target_languages: Optional[list[str]] = None,
+ translation_model: str = "gpt-oss",
) -> Union[str, Response]:
pass
diff --git a/src/use_cases/html_conversion/convert_to_html_use_case.py b/src/use_cases/html_conversion/convert_to_html_use_case.py
index f620d7e..044c7e3 100644
--- a/src/use_cases/html_conversion/convert_to_html_use_case.py
+++ b/src/use_cases/html_conversion/convert_to_html_use_case.py
@@ -21,6 +21,8 @@ def execute(
extract_toc: bool = False,
dpi: int = 120,
output_file: Optional[str] = None,
+ target_languages: Optional[list[str]] = None,
+ translation_model: str = "gpt-oss",
) -> Union[str, Response]:
if use_fast_mode:
analysis_result = self.pdf_analysis_service.analyze_pdf_layout_fast(pdf_content, "", True, False)
@@ -45,4 +47,6 @@ def execute(
elif isinstance(item, SegmentBox):
segments.append(item)
- return self.html_conversion_service.convert_to_html(pdf_content, segments, extract_toc, dpi, output_file)
+ return self.html_conversion_service.convert_to_html(
+ pdf_content, segments, extract_toc, dpi, output_file, target_languages, translation_model
+ )
diff --git a/src/use_cases/markdown_conversion/convert_to_markdown_use_case.py b/src/use_cases/markdown_conversion/convert_to_markdown_use_case.py
index 71329bc..33fd2db 100644
--- a/src/use_cases/markdown_conversion/convert_to_markdown_use_case.py
+++ b/src/use_cases/markdown_conversion/convert_to_markdown_use_case.py
@@ -21,6 +21,8 @@ def execute(
extract_toc: bool = False,
dpi: int = 120,
output_file: Optional[str] = None,
+ target_languages: Optional[list[str]] = None,
+ translation_model: str = "gpt-oss",
) -> Union[str, Response]:
if use_fast_mode:
analysis_result = self.pdf_analysis_service.analyze_pdf_layout_fast(pdf_content, "", True, False)
@@ -45,4 +47,6 @@ def execute(
elif isinstance(item, SegmentBox):
segments.append(item)
- return self.markdown_conversion_service.convert_to_markdown(pdf_content, segments, extract_toc, dpi, output_file)
+ return self.markdown_conversion_service.convert_to_markdown(
+ pdf_content, segments, extract_toc, dpi, output_file, target_languages, translation_model
+ )