Skip to content

Commit c309f79

Browse files
authored
Merge pull request #126 from huridocs/translation
Add translation support to translate documents into different languages
2 parents 167bf10 + 180cb9d commit c309f79

23 files changed

+1181
-33
lines changed

Dockerfile

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ FROM pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
22
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
33

44
RUN apt-get update
5-
RUN apt-get install --fix-missing -y -q --no-install-recommends libgomp1 ffmpeg libsm6 pdftohtml libxext6 git ninja-build g++ qpdf pandoc
5+
RUN apt-get install --fix-missing -y -q --no-install-recommends libgomp1 ffmpeg libsm6 pdftohtml libxext6 git ninja-build g++ qpdf pandoc curl
66

77

88
RUN apt-get install -y ocrmypdf
@@ -52,4 +52,3 @@ RUN python src/download_models.py
5252
ENV PYTHONPATH "${PYTHONPATH}:/app/src"
5353
ENV TRANSFORMERS_VERBOSITY=error
5454
ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
55-

Dockerfile.ollama

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
FROM ollama/ollama:latest
2+
3+
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
4+
5+
ENV OLLAMA_HOST=0.0.0.0:11434
6+
7+
EXPOSE 11434
8+
9+
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
10+
CMD curl -f http://localhost:11434/api/tags || exit 1

Makefile

Lines changed: 111 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,35 @@
11
HAS_GPU := $(shell command -v nvidia-smi > /dev/null && echo 1 || echo 0)
22

3+
help:
4+
@echo "PDF Document Layout Analysis - Available Commands:"
5+
@echo ""
6+
@echo "📄 Standard PDF Analysis (main app only):"
7+
@echo " make start - Auto-detects GPU, starts main app only"
8+
@echo " make start_no_gpu - Forces CPU mode, starts main app only"
9+
@echo " make start_detached - Background mode, main app only (CPU)"
10+
@echo " make start_detached_gpu - Background mode, main app only (GPU)"
11+
@echo ""
12+
@echo "🌐 With Translation Features (includes Ollama):"
13+
@echo " make start_translation - Auto-detects GPU, includes Ollama"
14+
@echo " make start_translation_no_gpu - Forces CPU mode, includes Ollama"
15+
@echo ""
16+
@echo "🧪 Testing & Utilities:"
17+
@echo " make test - Run Python tests"
18+
@echo " make stop - Stop all services"
19+
@echo ""
20+
@echo "🔧 Development:"
21+
@echo " make install_venv - Create virtual environment"
22+
@echo " make install - Install dependencies"
23+
@echo " make formatter - Format code with black"
24+
@echo " make check_format - Check code formatting"
25+
@echo ""
26+
@echo "🧹 Cleanup:"
27+
@echo " make remove_docker_containers - Remove Docker containers"
28+
@echo " make remove_docker_images - Remove Docker images"
29+
@echo " make free_up_space - Free up system space"
30+
@echo ""
31+
@echo "💡 Tip: Use 'make start' for basic PDF analysis, 'make start_translation' for translation features"
32+
333
install:
434
. .venv/bin/activate; pip install -Ur requirements.txt
535

@@ -31,19 +61,88 @@ else
3161
endif
3262
ifeq ($(HAS_GPU), 1)
3363
@echo "NVIDIA GPU detected, using docker-compose-gpu.yml"
34-
docker compose -f docker-compose-gpu.yml up --build
64+
docker compose -f docker-compose-gpu.yml up --build pdf-document-layout-analysis-gpu
3565
else
3666
@echo "No NVIDIA GPU detected, using docker-compose.yml"
37-
docker compose -f docker-compose.yml up --build
67+
docker compose -f docker-compose.yml up --build pdf-document-layout-analysis
3868
endif
3969

4070

4171
start_no_gpu:
4272
mkdir -p ./models
43-
docker compose up --build
73+
@echo "Starting with CPU-only configuration"
74+
docker compose up --build pdf-document-layout-analysis
75+
76+
start_translation:
77+
ifeq ($(OS), Windows_NT)
78+
if not exist models mkdir models
79+
else
80+
mkdir -p ./models
81+
endif
82+
ifeq ($(HAS_GPU), 1)
83+
@echo "NVIDIA GPU detected, starting with translation support (GPU-enabled Ollama)"
84+
@echo "Starting Ollama GPU container first..."
85+
docker compose -f docker-compose-gpu.yml up -d ollama-gpu
86+
@echo "Waiting for Ollama to be healthy..."
87+
@timeout=60; while [ $$timeout -gt 0 ]; do \
88+
if docker inspect --format='{{.State.Health.Status}}' ollama-service-gpu 2>/dev/null | grep -q "healthy"; then \
89+
echo "Ollama GPU container is healthy!"; \
90+
break; \
91+
fi; \
92+
echo "Waiting for Ollama GPU container to be healthy... ($$timeout seconds remaining)"; \
93+
sleep 5; \
94+
timeout=$$((timeout-5)); \
95+
done
96+
@if ! docker inspect --format='{{.State.Health.Status}}' ollama-service-gpu 2>/dev/null | grep -q "healthy"; then \
97+
echo "Warning: Ollama GPU container may not be fully healthy yet, but continuing..."; \
98+
fi
99+
@echo "Starting all services with translation support..."
100+
docker compose -f docker-compose-gpu.yml up --build pdf-document-layout-analysis-gpu-translation
101+
else
102+
@echo "No NVIDIA GPU detected, starting with translation support (CPU Ollama)"
103+
@echo "Starting Ollama container first..."
104+
docker compose -f docker-compose.yml up -d ollama
105+
@echo "Waiting for Ollama to be healthy..."
106+
@timeout=60; while [ $$timeout -gt 0 ]; do \
107+
if docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -q "healthy"; then \
108+
echo "Ollama container is healthy!"; \
109+
break; \
110+
fi; \
111+
echo "Waiting for Ollama container to be healthy... ($$timeout seconds remaining)"; \
112+
sleep 5; \
113+
timeout=$$((timeout-5)); \
114+
done
115+
@if ! docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -q "healthy"; then \
116+
echo "Warning: Ollama container may not be fully healthy yet, but continuing..."; \
117+
fi
118+
@echo "Starting all services with translation support..."
119+
docker compose -f docker-compose.yml up --build pdf-document-layout-analysis-translation
120+
endif
121+
122+
start_translation_no_gpu:
123+
mkdir -p ./models
124+
@echo "Starting with CPU-only configuration and translation support"
125+
@echo "Starting Ollama container first..."
126+
docker compose up -d ollama
127+
@echo "Waiting for Ollama to be healthy..."
128+
@timeout=60; while [ $$timeout -gt 0 ]; do \
129+
if docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -q "healthy"; then \
130+
echo "Ollama container is healthy!"; \
131+
break; \
132+
fi; \
133+
echo "Waiting for Ollama container to be healthy... ($$timeout seconds remaining)"; \
134+
sleep 5; \
135+
timeout=$$((timeout-5)); \
136+
done
137+
@if ! docker inspect --format='{{.State.Health.Status}}' ollama-service 2>/dev/null | grep -q "healthy"; then \
138+
echo "Warning: Ollama container may not be fully healthy yet, but continuing..."; \
139+
fi
140+
@echo "Starting all services with translation support..."
141+
docker compose up --build pdf-document-layout-analysis-translation
44142

45143
stop:
46144
docker compose stop
145+
docker compose -f docker-compose-gpu.yml stop
47146

48147
test:
49148
. .venv/bin/activate; command cd src; command python -m pytest
@@ -68,11 +167,18 @@ free_up_space:
68167

69168
start_detached:
70169
mkdir -p ./models
71-
docker compose up --build -d
170+
@echo "Starting in detached mode"
171+
docker compose up --build -d pdf-document-layout-analysis
172+
@echo "Main application started in background. Check status with: docker compose ps"
173+
@echo "View logs with: docker compose logs -f pdf-document-layout-analysis"
72174

73175
start_detached_gpu:
74176
mkdir -p ./models
75-
RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d
177+
@echo "Starting in detached mode with GPU"
178+
RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d pdf-document-layout-analysis-gpu
179+
@echo "Main application started in background. Check status with: docker compose -f docker-compose-gpu.yml ps"
180+
@echo "View logs with: docker compose -f docker-compose-gpu.yml logs -f pdf-document-layout-analysis-gpu"
181+
76182

77183
upgrade:
78184
. .venv/bin/activate; pip-upgrade

README.md

Lines changed: 77 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -66,18 +66,23 @@ This project provides a powerful and flexible PDF analysis microservice built wi
6666

6767
### 1. Start the Service
6868

69-
**With GPU support (recommended for better performance):**
69+
**Standard PDF Analysis (recommended for most users):**
7070
```bash
7171
make start
7272
```
7373

74-
**Without GPU support:**
74+
**With Translation Features (includes Ollama container):**
7575
```bash
76-
make start_no_gpu
76+
make start_translation
7777
```
7878

7979
The service will be available at `http://localhost:5060`
8080

81+
**See all available commands:**
82+
```bash
83+
make help
84+
```
85+
8186
**Check service status:**
8287

8388
```bash
@@ -170,8 +175,8 @@ The service provides a comprehensive RESTful API with the following endpoints:
170175

171176
| Endpoint | Method | Description | Parameters |
172177
|----------|--------|-------------|------------|
173-
| `/markdown` | POST | Convert PDF to Markdown (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
174-
| `/html` | POST | Convert PDF to HTML (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
178+
| `/markdown` | POST | Convert PDF to Markdown (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file`, `target_languages`, `translation_model` |
179+
| `/html` | POST | Convert PDF to HTML (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file`, `target_languages`, `translation_model` |
175180
| `/visualize` | POST | Visualize segmentation results on the PDF | `file`, `fast` |
176181

177182
### OCR & Utility Endpoints
@@ -192,6 +197,8 @@ The service provides a comprehensive RESTful API with the following endpoints:
192197
- **`types`**: Comma-separated content types to extract (string, default: "all")
193198
- **`extract_toc`**: Include table of contents at the beginning of the output (boolean, default: false)
194199
- **`dpi`**: Image resolution for conversion (integer, default: 120)
200+
- **`target_languages`**: Comma-separated list of target languages for translation (string, e.g. "Turkish, Spanish, French")
201+
- **`translation_model`**: Ollama model to use for translation (string, default: "gpt-oss")
195202

196203
## 💡 Usage Examples
197204

@@ -254,15 +261,75 @@ curl -X POST http://localhost:5060/markdown \
254261
curl -X POST http://localhost:5060/html \
255262
-F 'file=@document.pdf' \
256263
-F 'extract_toc=true' \
257-
-F 'output_file=document.html' \
264+
-F 'output_file=document.html' \
265+
--output 'document.zip'
266+
```
267+
268+
**Convert to Markdown with Translation:**
269+
```bash
270+
curl -X POST http://localhost:5060/markdown \
271+
-F 'file=@document.pdf' \
272+
-F 'output_file=document.md' \
273+
-F 'target_languages=Turkish, Spanish' \
274+
-F 'translation_model=gpt-oss' \
275+
--output 'document.zip'
276+
```
277+
278+
**Convert to HTML with Translation:**
279+
```bash
280+
curl -X POST http://localhost:5060/html \
281+
-F 'file=@document.pdf' \
282+
-F 'output_file=document.html' \
283+
-F 'target_languages=French, Russian' \
284+
-F 'translation_model=huihui_ai/hunyuan-mt-abliterated' \
258285
--output 'document.zip'
259286
```
260287

261-
> **📋 Segmentation Data**: Format conversion endpoints automatically include detailed segmentation data in the zip output. The resulting zip file contains a `{filename}_segmentation.json` file with information about each detected document segment including:
262-
> - **Coordinates**: `left`, `top`, `width`, `height`
263-
> - **Page information**: `page_number`, `page_width`, `page_height`
264-
> - **Content**: `text` content and segment `type` (e.g., "Title", "Text", "Table", "Picture")
288+
> **📋 Segmentation Data & Translations**: Format conversion endpoints automatically include detailed segmentation data in the zip output. The resulting zip file contains:
289+
> - **Original file**: The converted document in the requested format
290+
> - **Segmentation data**: `{filename}_segmentation.json` file with information about each detected document segment:
291+
> - **Coordinates**: `left`, `top`, `width`, `height`
292+
> - **Page information**: `page_number`, `page_width`, `page_height`
293+
> - **Content**: `text` content and segment `type` (e.g., "Title", "Text", "Table", "Picture")
294+
> - **Translated files** (if `target_languages` specified): `{filename}_{language}.{extension}` for each target language
295+
> - **Images** (if present): `{filename}_pictures/` directory containing extracted images
296+
297+
### Translation Features
298+
299+
The `/markdown` and `/html` endpoints support automatic translation of the converted content into multiple languages using Ollama models.
265300

301+
**Translation Requirements:**
302+
- The specified translation model must be available in Ollama
303+
- An `output_file` must be specified (translations are only included in zip responses)
304+
305+
**Supported Translation Models:**
306+
- Any Ollama-compatible model (e.g., `gpt-oss`, `llama2`, `mistral`, etc.)
307+
- Models are automatically downloaded if not present locally
308+
309+
**Translation Process:**
310+
1. The service checks if the specified model is available in Ollama
311+
2. If not available, it attempts to download the model using `ollama pull`
312+
3. For each target language, the content is translated while preserving:
313+
- Original formatting and structure
314+
- Markdown/HTML syntax
315+
- Links and references
316+
- Image references and tables
317+
4. Translated files are named: `{filename}_{language}.{extension}`
318+
319+
_**Note that translation quality depends mostly on the model used. Smaller models may produce output containing unexpected or undesired elements. To balance performance and quality for regular users, we tested several reasonably sized models. The results for `gpt-oss` were satisfactory, which is why it is the default model. If you need something smaller, you can also try `huihui_ai/hunyuan-mt-abliterated`; we found it gives decent results, especially when the text does not have much styling.**_
320+
321+
**Example Translation Output:**
322+
```
323+
document.zip
324+
├── document.md # Source text with markdown/html styling
325+
├── document_Spanish.md # Spanish translation
326+
├── document_French.md # French translation
327+
├── document_Turkish.md # Turkish translation
328+
├── document_segmentation.json # Segmentation information
329+
└── document_pictures/ # (if images present)
330+
├── document_1_1.png
331+
└── document_1_2.png
332+
```
266333

267334
### OCR Processing
268335

docker-compose-gpu.yml

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,53 @@
11
services:
2-
pdf-document-layout-analysis-gpu:
2+
ollama-gpu:
33
extends:
44
file: docker-compose.yml
5-
service: pdf-document-layout-analysis
5+
service: ollama
6+
container_name: ollama-service-gpu
7+
deploy:
8+
resources:
9+
reservations:
10+
devices:
11+
- driver: nvidia
12+
count: 1
13+
capabilities: [ gpu ]
14+
environment:
15+
- NVIDIA_VISIBLE_DEVICES=all
16+
17+
pdf-document-layout-analysis-gpu:
18+
container_name: pdf-document-layout-analysis-gpu
19+
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
20+
init: true
21+
restart: unless-stopped
22+
build:
23+
context: .
24+
dockerfile: Dockerfile
25+
ports:
26+
- "5060:5060"
27+
deploy:
28+
resources:
29+
reservations:
30+
devices:
31+
- driver: nvidia
32+
count: 1
33+
capabilities: [ gpu ]
34+
environment:
35+
- RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
36+
- OLLAMA_HOST=http://localhost:11434
37+
38+
pdf-document-layout-analysis-gpu-translation:
39+
container_name: pdf-document-layout-analysis-gpu-translation
40+
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
41+
init: true
42+
restart: unless-stopped
43+
build:
44+
context: .
45+
dockerfile: Dockerfile
46+
ports:
47+
- "5060:5060"
48+
depends_on:
49+
ollama-gpu:
50+
condition: service_healthy
651
deploy:
752
resources:
853
reservations:
@@ -11,4 +56,11 @@ services:
1156
count: 1
1257
capabilities: [ gpu ]
1358
environment:
14-
- RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
59+
- RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
60+
- OLLAMA_HOST=http://ollama-gpu:11434
61+
networks:
62+
- pdf-analysis-network
63+
64+
networks:
65+
pdf-analysis-network:
66+
driver: bridge

0 commit comments

Comments
 (0)