docproc/docproc.example.yaml at main · rithulkamesh/docproc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# docproc v2 Configuration
# Copy to docproc.yaml and customize.
# Single database provider, multiple AI providers. Config is the source of truth.

# Database: choose ONE provider
database:
  # Vector store backend. Supported values:
  #   pgvector | qdrant | chroma | faiss | memory
  provider: qdrant
  # Can also be provided via the DATABASE_URL environment variable.
  connection_string: "http://localhost:6333"
  collection_name: docproc
  # NOTE(review): presumably must equal the embedding model's vector size;
  # confirm against the configured embedding model.
  embed_dim: 1536
  # For chroma/faiss persistent storage, uncomment the next line:
  # path: ./data/chroma

# AI providers: list one or more; `primary_ai` below selects the primary one
ai_providers:
  # Active entry: OpenAI, using gpt-4o as both the default model and the
  # default vision model.
  - provider: openai
    default_model: gpt-4o
    default_vision_model: gpt-4o
  # Alternative providers (disabled). Uncomment an entry to add it to the list.
  # - provider: anthropic
  #   default_model: claude-sonnet-4-20250514
  # - provider: ollama
  #   base_url: http://localhost:11434
  #   default_model: llava
  # - provider: litellm
  #   default_model: gpt-4o
# Name of the primary AI provider.
# NOTE(review): presumably must match a `provider` value listed under
# ai_providers — confirm against the config loader.
primary_ai: openai

# RAG (retrieval-augmented generation) settings
rag:
  # Retrieval backend; `clara` is the default, `embedding` is the alternative.
  backend: clara  # clara (default) | embedding
  # NOTE(review): presumably the number of results retrieved per query — confirm.
  top_k: 5
  # NOTE(review): unit (tokens vs. characters) is not stated here — confirm.
  chunk_size: 512
  namespace: default

# Document ingestion (PDF vision extraction)
ingest:
  # Cleanup toggles applied during ingestion.
  # NOTE(review): exact behavior of each flag is defined by the loader, not
  # shown in this file — confirm before relying on it.
  sanitize: true
  drop_exact_duplicates: true
  drop_boilerplate: true
  use_vision: true  # PDF: true = send embedded images to a vision LLM; false = text only
  use_llm_refine: true  # LLM pass before indexing: cleans markdown and LaTeX math, removes boilerplate