|
| 1 | +# ======================================== |
| 2 | +# Hedera Guardian AI Toolkit - Environment Configuration |
| 3 | +# ======================================== |
| 4 | +# Copy this file to .env and customize as needed. |
| 5 | +# Each section corresponds to a package with its own configuration reference. |
| 6 | + |
| 7 | +# ======================================== |
| 8 | +# MCP Server Configuration |
| 9 | +# See: packages/hedera_guardian_mcp_server/CONFIG.md |
| 10 | +# ======================================== |
| 11 | +MCP_SERVER_HOST=0.0.0.0 |
| 12 | +MCP_SERVER_PORT=9000 |
| 13 | +MCP_SERVER_LOG_LEVEL=INFO |
| 14 | +# Enable detailed tool logging for debugging (arguments, responses) |
| 15 | +# MCP_SERVER_TOOL_LOGGING_ENABLED=false |
| 16 | + |
| 17 | +# MCP Server - Qdrant Connection |
| 18 | +# Docker: use "qdrant" (service name) instead of localhost |
| 18 | +QDRANT_HOST=localhost |
| 19 | +QDRANT_PORT=6333 |
| 20 | +# QDRANT_API_KEY= # Optional API key |
| 21 | + |
| 22 | +# MCP Server - Qdrant Collections |
| 23 | +QDRANT_SCHEMA_COLLECTION=schema_properties |
| 24 | +QDRANT_METHODOLOGY_COLLECTION=methodology_documents |
| 25 | + |
| 26 | +# Embedding Configuration (shared by MCP server) |
| 27 | +# Model name for generating embeddings (shared across all services) |
| 28 | +EMBEDDING_MODEL=aapot/bge-m3-onnx |
| 29 | +# Embedding provider type (bge_m3_onnx enables hybrid search with dense+sparse vectors) |
| 30 | +EMBEDDING_PROVIDER=bge_m3_onnx |
| 31 | + |
| 32 | +# Schema Builder output directory (used by MCP server for Excel generation) |
| 33 | +EXCEL_OUTPUT_DIR=./data/output |
| 34 | + |
| 35 | +# ======================================== |
| 36 | +# Schema Ingestion Worker Configuration |
| 37 | +# See: packages/schema_ingestion_worker/CONFIG.md |
| 38 | +# ======================================== |
| 39 | +# Qdrant URL for schema ingestion worker (usually same host as MCP server's QDRANT_HOST) |
| 40 | +SCHEMA_INGESTION_QDRANT_URL=http://localhost:6333 |
| 41 | +# SCHEMA_INGESTION_QDRANT_API_KEY= # Optional API key |
| 42 | +# Collection name where schemas will be stored |
| 43 | +SCHEMA_INGESTION_QDRANT_COLLECTION_NAME=schema_properties |
| 44 | +# Embedding model and provider (BGE-M3 ONNX enables hybrid search with dense+sparse vectors) |
| 45 | +SCHEMA_INGESTION_EMBEDDING_MODEL_NAME=aapot/bge-m3-onnx |
| 46 | +SCHEMA_INGESTION_EMBEDDING_PROVIDER_TYPE=bge_m3_onnx |
| 47 | +# Directory containing JSON schemas to be ingested |
| 48 | +SCHEMA_INGESTION_INPUT_SCHEMAS_DIR=data/input/schemas |
| 49 | +# Docker: overridden in docker-compose.yml (/data/input/schemas) |
| 50 | +# Directory for ingestion output logs and reports |
| 51 | +SCHEMA_INGESTION_OUTPUT_DIR=data/output |
| 52 | +# Docker: overridden in docker-compose.yml (/data/output) |
| 53 | +# Ingestion mode: 'override' (replace all data, default) or 'append' (add to existing data) |
| 54 | +SCHEMA_INGESTION_MODE=override |
| 55 | +# Batch sizes for embedding and upsert operations |
| 56 | +SCHEMA_INGESTION_EMBEDDING_BATCH_SIZE=256 |
| 57 | +SCHEMA_INGESTION_VECTOR_UPSERT_BATCH_SIZE=50 |
| 58 | +# ONNX inference sub-batch size (bounds peak memory per session.run() call, default: 32) |
| 59 | +SCHEMA_INGESTION_ONNX_INFERENCE_BATCH_SIZE=32 |
| 60 | +# Timeouts (seconds) for embedding and upsert operations |
| 61 | +SCHEMA_INGESTION_EMBEDDING_TIMEOUT=300 |
| 62 | +SCHEMA_INGESTION_UPSERT_TIMEOUT=60 |
| 63 | +# Logging level for ingestion worker (DEBUG, INFO, WARNING, ERROR) |
| 64 | +SCHEMA_INGESTION_LOG_LEVEL=INFO |
| 65 | + |
| 66 | +# ======================================== |
| 67 | +# Document Ingestion Worker Configuration |
| 68 | +# See: packages/document_ingestion_worker/CONFIG.md |
| 69 | +# ======================================== |
| 70 | +# Processes PDF documents into vector embeddings for semantic search. |
| 71 | +# Pipeline: PDF → Parse → Chunk → Embed → Qdrant |
| 72 | +# |
| 73 | +# Default profile: Balanced 16GB (optimized for 16GB RAM systems) |
| 74 | +# See packages/document_ingestion_worker/README.md for all settings |
| 75 | + |
| 76 | +# Qdrant settings |
| 77 | +DOCUMENT_INGESTION_QDRANT_URL=http://localhost:6333 |
| 78 | +DOCUMENT_INGESTION_QDRANT_COLLECTION_NAME=methodology_documents |
| 79 | +# DOCUMENT_INGESTION_QDRANT_API_KEY= # Optional API key |
| 80 | + |
| 81 | +# Base data directory (all paths are derived from this) |
| 82 | +# Structure: |
| 83 | +# data/input/documents/ - Place input PDFs here |
| 84 | +# data/staged/documents/ - Intermediate files (parsed JSON, raw chunks, etc.) |
| 85 | +DOCUMENT_INGESTION_DATA_DIR=data |
| 86 | + |
| 87 | +# Ingestion mode: 'override' (default, replace all data) or 'append' (add to existing data) |
| 88 | +DOCUMENT_INGESTION_MODE=override |
| 89 | + |
| 90 | +# Processing parallelism (default: 1 for 16GB systems) |
| 91 | +# Increase for systems with more RAM (see packages/document_ingestion_worker/README.md for recommendations) |
| 92 | +DOCUMENT_INGESTION_MAX_PARALLEL_FILES=1 |
| 93 | + |
| 94 | +# Subprocess timeout per document in seconds (default: 7200 = 2 hours) |
| 95 | +# Handles 200+ page methodology documents on CPU. Reduce for smaller batches. |
| 96 | +# DOCUMENT_INGESTION_SUBPROCESS_TIMEOUT_SECONDS=7200 |
| 97 | + |
| 98 | +# Embedding settings (bge_m3_onnx enables hybrid search with dense+sparse vectors) |
| 99 | +DOCUMENT_INGESTION_EMBEDDING_MODEL_NAME=aapot/bge-m3-onnx |
| 100 | +DOCUMENT_INGESTION_EMBEDDING_PROVIDER_TYPE=bge_m3_onnx |
| 101 | +# Embedding batch size (default: 5 for 16GB, increase to 10+ for more RAM) |
| 102 | +DOCUMENT_INGESTION_EMBEDDING_BATCH_SIZE=5 |
| 103 | +# Vector upsert batch size (default: 20) |
| 104 | +DOCUMENT_INGESTION_VECTOR_UPSERT_BATCH_SIZE=20 |
| 105 | + |
| 106 | +# PDF Backend settings |
| 107 | +# PDF parsing backend: dlparse_v2 (default, faster C++ parser, no OCR support) |
| 108 | +# or dlparse_v1 (slower, supports Tesseract OCR for scanned documents) |
| 109 | +DOCUMENT_INGESTION_PDF_BACKEND=dlparse_v2 |
| 110 | +# Page rendering scale (default: 2.0 for 16GB, set to 4.0 for High Quality profile) |
| 111 | +DOCUMENT_INGESTION_PDF_IMAGES_SCALE=2.0 |
| 112 | + |
| 113 | +# PDF Parser settings |
| 114 | +# Enable OCR for scanned PDFs (default: false - digital PDFs don't need OCR) |
| 115 | +DOCUMENT_INGESTION_DO_OCR=false |
| 116 | +# OCR uses Tesseract CLI (only option - install via: choco/apt/brew install tesseract) |
| 117 | +# OCR language(s) - Tesseract format (ISO 639-3): ["eng"], ["eng", "deu"] |
| 118 | +DOCUMENT_INGESTION_OCR_LANG=["eng"] |
| 119 | +# Path to Tesseract executable (optional, uses system PATH if not set) |
| 120 | +# Windows example: C:/Program Files/Tesseract-OCR/tesseract.exe |
| 121 | +# DOCUMENT_INGESTION_TESSERACT_CMD= |
| 122 | +# Force OCR on full page even if text is detected |
| 123 | +DOCUMENT_INGESTION_FORCE_FULL_PAGE_OCR=false |
| 124 | +# Enable table structure extraction |
| 125 | +DOCUMENT_INGESTION_DO_TABLE_STRUCTURE=true |
| 126 | +# Table structure mode: 'accurate' (default, for complex tables) or 'fast' (simple tables) |
| 127 | +# DOCUMENT_INGESTION_TABLE_STRUCTURE_MODE=accurate |
| 128 | +# Enable cell matching for tables |
| 129 | +DOCUMENT_INGESTION_DO_CELL_MATCHING=true |
| 130 | + |
| 131 | +# Surya Formula Enrichment (better accuracy than Docling's built-in model) |
| 132 | +# Enabled by default - set to false to use Docling's formula model instead |
| 133 | +DOCUMENT_INGESTION_USE_SURYA_FORMULA_ENRICHMENT=true |
| 134 | +# Batch size for Surya (default: 2 for 16GB CPU, increase to 8-16 for GPU) |
| 135 | +DOCUMENT_INGESTION_SURYA_BATCH_SIZE=2 |
| 136 | +# Pre-inference upscale (default: 1.5, improves subscript recognition) |
| 137 | +DOCUMENT_INGESTION_SURYA_UPSCALE_FACTOR=1.5 |
| 138 | +# Bounding box expansion factors for formula detection (0.15 = 15% each side) |
| 139 | +# DOCUMENT_INGESTION_SURYA_EXPANSION_FACTOR_HORIZONTAL=0.15 |
| 140 | +# DOCUMENT_INGESTION_SURYA_EXPANSION_FACTOR_VERTICAL=0.15 |
| 141 | + |
| 142 | +# Legacy Docling formula enrichment pipeline |
| 143 | +# When USE_SURYA_FORMULA_ENRICHMENT=true (above), Surya replaces Docling's built-in |
| 144 | +# formula model. This flag controls Docling's own enrichment pipeline and should |
| 145 | +# remain true so that formula elements are still detected during layout analysis. |
| 146 | +DOCUMENT_INGESTION_DO_FORMULA_ENRICHMENT=true |
| 147 | + |
| 148 | +# Layout analysis model: heron-101 (default, best accuracy), heron (faster), egret-m/l/x |
| 149 | +# DOCUMENT_INGESTION_LAYOUT_MODEL=heron-101 |
| 150 | +# Hardware acceleration: auto (default, detects CUDA > MPS > CPU), cuda, mps, cpu |
| 151 | +# DOCUMENT_INGESTION_ACCELERATOR_DEVICE=auto |
| 152 | + |
| 153 | +# Batch processing settings (default: 2 for 16GB CPU) |
| 154 | +DOCUMENT_INGESTION_LAYOUT_BATCH_SIZE=2 |
| 155 | +DOCUMENT_INGESTION_OCR_BATCH_SIZE=2 |
| 156 | +DOCUMENT_INGESTION_TABLE_BATCH_SIZE=2 |
| 157 | +DOCUMENT_INGESTION_NUM_THREADS=2 |
| 158 | + |
| 159 | +# Table postprocessing |
| 160 | +# Detect and merge tables split across page boundaries (default: true) |
| 161 | +# Recommended for methodology documents with large tables spanning multiple pages |
| 162 | +DOCUMENT_INGESTION_MERGE_SPLIT_TABLES=true |
| 163 | +# Keep each table as a separate chunk (default: true) |
| 164 | +# Prevents HybridChunker from merging multiple tables under the same heading |
| 165 | +DOCUMENT_INGESTION_ISOLATE_TABLE_CHUNKS=true |
| 166 | + |
| 167 | +# Chunker settings |
| 168 | +# Maximum tokens per chunk (default: 5000 for comprehensive context) |
| 169 | +DOCUMENT_INGESTION_CHUNK_MAX_TOKENS=5000 |
| 170 | +# Overlapping tokens between chunks (default: 0) |
| 171 | +DOCUMENT_INGESTION_CHUNK_OVERLAP_TOKENS=0 |
| 172 | + |
| 173 | +# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) |
| 174 | +DOCUMENT_INGESTION_LOG_LEVEL=INFO |
| 175 | + |
| 176 | +# Pipeline start point for resuming: |
| 177 | +# 'beginning' - Full pipeline (parse → chunk → embed) |
| 178 | +# 'parsed' - Skip parsing, load from staged/<doc>/parsed/ |
| 179 | +# 'chunked' - Skip parsing+chunking, load from staged/<doc>/chunks/ |
| 180 | +DOCUMENT_INGESTION_START_FROM=beginning |
0 commit comments