-
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdocproc.example.yaml
More file actions
42 lines (36 loc) · 1.19 KB
/
docproc.example.yaml
File metadata and controls
42 lines (36 loc) · 1.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# docproc v2 Configuration
# Copy to docproc.yaml and customize.
# Single database provider, multiple AI providers. Config is the source of truth.
# Database: choose ONE provider
database:
  # Supported values: pgvector | qdrant | chroma | faiss | memory
  provider: qdrant
  # May alternatively be supplied through the DATABASE_URL env variable.
  connection_string: http://localhost:6333
  collection_name: docproc
  embed_dim: 1536
  # Uncomment for chroma/faiss persistent storage:
  # path: ./data/chroma
# AI providers: list multiple, one is primary
ai_providers:
  - provider: openai
    default_model: gpt-4o
    default_vision_model: gpt-4o
  # Additional providers; uncomment an entry (keeping its indentation) to
  # enable it.
  # - provider: anthropic
  #   default_model: claude-sonnet-4-20250514
  # - provider: ollama
  #   base_url: http://localhost:11434
  #   default_model: llava
  # - provider: litellm
  #   default_model: gpt-4o

# Selects the primary provider. Presumably this must match the `provider`
# value of one entry in `ai_providers` -- confirm against the loader.
primary_ai: openai
# RAG
rag:
  # Supported values: clara (default) | embedding
  backend: clara
  top_k: 5
  chunk_size: 512
  namespace: default
# Document ingestion (PDF vision extraction)
ingest:
  sanitize: true
  drop_exact_duplicates: true
  drop_boilerplate: true
  # PDF: send embedded images to vision LLM; false = text only
  use_vision: true
  # LLM pass: clean markdown, LaTeX math, remove boilerplate before indexing
  use_llm_refine: true