Skip to content

Commit 2e070a2

Browse files
committed
[owl] VLM OCR Documents Parsing (#878)
* VLM OCR Implementation
* Allow user specified VLM model
* Allow VLM OCR toggle and model setting in env
* Update PDF loader configs
* Fix uvicorn version
* Configure Docker daemon
* Support uvicorn 0.40.0
* Mirror gotenberg to ghcr
* Update gunicorn hook
* Remove redundant retry logic
1 parent 1871541 commit 2e070a2

File tree

14 files changed

+661
-47
lines changed

14 files changed

+661
-47
lines changed

.env.example

Lines changed: 5 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -44,6 +44,7 @@ JAMAI_API_BASE=http://localhost:6969/api
4444
# OWL_TEST_LLM_API_BASE=http://localhost:6970/v1
4545
# OWL_S3_ENDPOINT=http://localhost:9000
4646
# OWL_FILE_PROXY_URL=website-url
47+
# OWL_GOTENBERG_URL=http://localhost:3001
4748

4849
# Configuration
4950
OWL_PORT=6969
@@ -54,6 +55,10 @@ PB_MAX_CLIENT_CONN=500
5455
PB_MAX_CLIENT_CONN=80
5556
PG_MAX_CONNECTIONS=100
5657

58+
# VLM OCR Config
59+
OWL_USE_VLM_OCR=true
60+
# OWL_VLM_MODEL_ID=
61+
5762
# Frontend config
5863
HOST=localhost
5964
ORIGIN=http://localhost:4000

.github/workflows/ci.yml

Lines changed: 5 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -145,10 +145,12 @@ jobs:
145145
OWL_STRIPE_WEBHOOK_SECRET_TEST: ${{ secrets.OWL_STRIPE_WEBHOOK_SECRET_TEST }}
146146
OWL_STRIPE_PUBLISHABLE_KEY_TEST: ${{ secrets.OWL_STRIPE_PUBLISHABLE_KEY_TEST }}
147147

148-
- name: Configure Docker timeout
148+
- name: Configure Docker daemon
149149
run: |
150-
mkdir -p ~/.docker
151-
echo '{"max-concurrent-downloads": 3, "max-download-attempts": 5}' > ~/.docker/config.json
150+
DAEMON_JSON=/etc/docker/daemon.json
151+
EXISTING=$(sudo cat "$DAEMON_JSON" 2>/dev/null || echo '{}')
152+
echo "$EXISTING" | jq '. + {"max-concurrent-downloads": 3, "max-download-attempts": 5}' | sudo tee "$DAEMON_JSON"
153+
sudo systemctl restart docker
152154
153155
- name: Launch services
154156
id: launch_services
File renamed without changes.

CLAUDE.md

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1 @@
1+
Always read AGENTS.md and CONVENTIONS-BACKEND.md for guidelines.

docker/compose.base.yml

Lines changed: 15 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -215,6 +215,19 @@ services:
215215
networks:
216216
- jamai
217217

218+
gotenberg:
219+
image: ghcr.io/embeddedllm/gotenberg/gotenberg:8
220+
healthcheck:
221+
test: ["CMD", "curl", "-f", "http://localhost:3000/health"]
222+
interval: 30s
223+
timeout: 5s
224+
retries: 5
225+
start_period: 20s
226+
ports:
227+
- "3001:3000"
228+
networks:
229+
- jamai
230+
218231
owl:
219232
build:
220233
context: ..
@@ -254,6 +267,8 @@ services:
254267
condition: service_healthy
255268
docling:
256269
condition: service_healthy
270+
gotenberg:
271+
condition: service_healthy
257272
healthcheck:
258273
test: ["CMD-SHELL", "curl --fail localhost:6969/api/health || exit 1"]
259274
interval: 10s

services/api/pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -32,6 +32,7 @@ source = ["owl"]
3232
relative_files = true
3333
concurrency = ["multiprocessing", "thread", "greenlet"]
3434
parallel = true
35+
sigterm = true
3536

3637
[tool.coverage.paths]
3738
source = ["src", "api/src"]
@@ -130,6 +131,7 @@ dependencies = [
130131
"email-validator~=2.2.0",
131132
"fastapi[standard]~=0.115.0",
132133
"flower~=2.0.0",
134+
"griffe~=1.14.0",
133135
"gunicorn~=22.0.0",
134136
"httpx~=0.27",
135137
"itsdangerous~=2.2.0",
@@ -188,6 +190,7 @@ dependencies = [
188190
"pydantic-extra-types~=2.10.0",
189191
"pydantic-settings~=2.10.0",
190192
"pydantic[email,timezone]~=2.11.0", # 2.12 causes issues with sqlmodel Datetime
193+
"pydantic-ai-slim[retry]~=1.59.0",
191194
"pydub~=0.25.0",
192195
"pyjwt~=2.10.0",
193196
"pylance==0.16.0",
@@ -208,7 +211,7 @@ dependencies = [
208211
"typing_extensions~=4.14.0",
209212
"uuid-utils~=0.9.0",
210213
"uuid7~=0.1.0",
211-
"uvicorn[standard]~=0.28",
214+
"uvicorn[standard]~=0.40.0",
212215
"xmltodict~=0.14.0",
213216
]
214217
dynamic = ["version"]

services/api/src/owl/configs/oss.py

Lines changed: 5 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -43,6 +43,8 @@ class EnvConfig(BaseSettings):
4343
code_executor_endpoint: str = "http://kopi:3000"
4444
docling_url: str = "http://docling:5001"
4545
docling_timeout_sec: Annotated[int, Field(gt=0, le=60 * 60)] = 20 * 60
46+
gotenberg_url: str = "http://gotenberg:3000"
47+
gotenberg_timeout_sec: Annotated[int, Field(gt=0, le=60 * 60)] = 5 * 60
4648
test_llm_api_base: str = "http://test-llm:6970/v1"
4749
# Configs
4850
embed_file_upload_max_bytes: int = 200 * 1024 * 1024 # 200MiB in bytes
@@ -59,7 +61,9 @@ class EnvConfig(BaseSettings):
5961
max_write_batch_size: int = 100
6062
max_file_cache_size: int = 20
6163
# PDF Loader configs
62-
fast_pdf_parsing: bool = True
64+
use_vlm_ocr: bool = True # Enable VLM OCR (otherwise use Docling OCR)
65+
# VLM model ID for OCR, only used when use_vlm_ocr is True.
66+
vlm_model_id: str = "openai/gpt-4o-mini"
6367
# LLM configs
6468
llm_timeout_sec: Annotated[int, Field(gt=0, le=60 * 60)] = 60
6569
embed_timeout_sec: Annotated[int, Field(gt=0, le=60 * 60)] = 60

services/api/src/owl/db/gen_executor.py

Lines changed: 26 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -1361,7 +1361,12 @@ async def _load_files(self, message: ChatThreadEntry) -> ChatEntry:
13611361
if isinstance(c, TextContent):
13621362
contents.append(c)
13631363
else:
1364-
data = await _load_uri_as_base64(c.uri)
1364+
data = await _load_uri_as_base64(
1365+
c.uri,
1366+
request=self.request,
1367+
organization=self.organization,
1368+
lm_engine=self.lm,
1369+
)
13651370
if getattr(self._col_map.get(c.column_name, None), "is_document_column", False):
13661371
# Document (data could be None)
13671372
replacements[c.column_name] = str(data)
@@ -1603,12 +1608,20 @@ def _resolve_image_output_type(image_bytes: bytes, mime_type: str | None) -> tup
16031608
return mime or "image/png", extension
16041609

16051610

1606-
async def _load_uri_as_base64(uri: str | None) -> str | AudioContent | ImageContent | None:
1611+
async def _load_uri_as_base64(
1612+
uri: str | None,
1613+
request: Request | None = None,
1614+
organization: OrganizationRead | None = None,
1615+
lm_engine: LMEngine | None = None,
1616+
) -> str | AudioContent | ImageContent | None:
16071617
"""
16081618
Loads a file from URI for LLM inference.
16091619
16101620
Args:
16111621
uri (str | None): The URI of the file.
1622+
request (Request | None): FastAPI request object for VLM OCR.
1623+
organization (OrganizationRead | None): Organization for VLM OCR.
1624+
lm_engine (LMEngine | None): LMEngine instance for VLM OCR.
16121625
16131626
Returns:
16141627
content (str | AudioContent | ImageContent): The file content.
@@ -1635,7 +1648,17 @@ async def _load_uri_as_base64(uri: str | None) -> str | AudioContent | ImageCont
16351648
try:
16361649
# Load as document
16371650
if extension in DOCUMENT_FILE_EXTENSIONS:
1638-
return await GeneralDocLoader().load_document(basename(uri), file_binary)
1651+
try:
1652+
return await GeneralDocLoader(
1653+
request_id=request.state.id,
1654+
lm_engine=lm_engine,
1655+
).load_document(basename(uri), file_binary)
1656+
except Exception as e:
1657+
logger.warning(
1658+
f'Failed to use VLM OCR for "{uri}": {repr(e)}, falling back to Docling.'
1659+
)
1660+
# Retry using Docling OCR as fallback
1661+
return await GeneralDocLoader().load_document(basename(uri), file_binary)
16391662
# Load as audio or image
16401663
else:
16411664
base64_data = base64.b64encode(file_binary).decode("utf-8")

0 commit comments

Comments (0)