Commit 575a707

danielnorkin authored and Pyatakov committed
feat: Add Hedera Guardian AI Toolkit
Initial import of the Hedera Guardian AI Toolkit project. Adds repository scaffold including README, LICENSE, .env.example, .gitignore, pre-commit config, Docker Compose files (base, GPU, low-memory), package directories (document_ingestion_worker, schema_ingestion_worker, hedera_guardian_mcp_server, policy_schema_builder, vector_store), documentation, configs, source code, and comprehensive tests/fixtures. Implements the end-to-end pipeline to parse PDFs/DOCX into vector embeddings, ingest JSON schemas, and expose search/schema tools via an MCP server.
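The parse → chunk → embed pipeline named in the commit message can be sketched as follows. This is an illustrative outline only, with stub parsing and a dummy embedding; the function and class names are hypothetical and are not the project's actual API.

```python
# Hypothetical sketch of the ingestion pipeline stages: parse -> chunk -> embed.
# Names and signatures are illustrative, not the toolkit's real interfaces.
from dataclasses import dataclass


@dataclass
class Chunk:
    doc_id: str
    text: str


def parse(path: str) -> str:
    """Parse a PDF/DOCX into plain text (stub for illustration)."""
    return f"text of {path}"


def chunk(doc_id: str, text: str, max_tokens: int = 5000) -> list[Chunk]:
    """Split text into bounded chunks (whitespace tokens as a stand-in)."""
    words = text.split()
    return [
        Chunk(doc_id, " ".join(words[i : i + max_tokens]))
        for i in range(0, len(words), max_tokens)
    ]


def embed(chunks: list[Chunk]) -> list[tuple[Chunk, list[float]]]:
    """Attach a vector to each chunk (dummy embedding by text length)."""
    return [(c, [float(len(c.text))]) for c in chunks]


def ingest(path: str) -> int:
    """Run parse -> chunk -> embed and return the number of vectors produced."""
    text = parse(path)
    vectors = embed(chunk(path, text))
    return len(vectors)
```

In the real project, the embedding step is backed by the BGE-M3 ONNX model and the vectors are upserted into Qdrant collections, as the configuration files below show.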
1 parent 87e2fd9 · commit 575a707

File tree

202 files changed · 84,932 additions · 0 deletions

hedera-guardian-ai-toolkit/.env.example

Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
# ========================================
# Hedera Guardian AI Toolkit - Environment Configuration
# ========================================
# Copy this file to .env and customize as needed.
# Each section corresponds to a package with its own configuration reference.

# ========================================
# MCP Server Configuration
# See: packages/hedera_guardian_mcp_server/CONFIG.md
# ========================================
MCP_SERVER_HOST=0.0.0.0
MCP_SERVER_PORT=9000
MCP_SERVER_LOG_LEVEL=INFO
# Enable detailed tool logging for debugging (arguments, responses)
# MCP_SERVER_TOOL_LOGGING_ENABLED=false

# MCP Server - Qdrant Connection
QDRANT_HOST=localhost # Docker: use "qdrant" (service name)
QDRANT_PORT=6333
# QDRANT_API_KEY= # Optional API key

# MCP Server - Qdrant Collections
QDRANT_SCHEMA_COLLECTION=schema_properties
QDRANT_METHODOLOGY_COLLECTION=methodology_documents

# Embedding Configuration (shared by MCP server)
# Model name for generating embeddings (shared across all services)
EMBEDDING_MODEL=aapot/bge-m3-onnx
# Embedding provider type (bge_m3_onnx enables hybrid search with dense+sparse vectors)
EMBEDDING_PROVIDER=bge_m3_onnx

# Schema Builder output directory (used by MCP server for Excel generation)
EXCEL_OUTPUT_DIR=./data/output

# ========================================
# Schema Ingestion Worker Configuration
# See: packages/schema_ingestion_worker/CONFIG.md
# ========================================
# Qdrant URL for schema ingestion worker (usually same host as MCP server's QDRANT_HOST)
SCHEMA_INGESTION_QDRANT_URL=http://localhost:6333
# SCHEMA_INGESTION_QDRANT_API_KEY= # Optional API key
# Collection name where schemas will be stored
SCHEMA_INGESTION_QDRANT_COLLECTION_NAME=schema_properties
# Embedding model and provider (BGE-M3 ONNX enables hybrid search with dense+sparse vectors)
SCHEMA_INGESTION_EMBEDDING_MODEL_NAME=aapot/bge-m3-onnx
SCHEMA_INGESTION_EMBEDDING_PROVIDER_TYPE=bge_m3_onnx
# Directory containing JSON schemas to be ingested
SCHEMA_INGESTION_INPUT_SCHEMAS_DIR=data/input/schemas
# Docker: overridden in docker-compose.yml (/data/input/schemas)
# Directory for ingestion output logs and reports
SCHEMA_INGESTION_OUTPUT_DIR=data/output
# Docker: overridden in docker-compose.yml (/data/output)
# Ingestion mode: 'override' (replace all data, default) or 'append' (add to existing data)
SCHEMA_INGESTION_MODE=override
# Batch sizes for embedding and upsert operations
SCHEMA_INGESTION_EMBEDDING_BATCH_SIZE=256
SCHEMA_INGESTION_VECTOR_UPSERT_BATCH_SIZE=50
# ONNX inference sub-batch size (bounds peak memory per session.run() call, default: 32)
SCHEMA_INGESTION_ONNX_INFERENCE_BATCH_SIZE=32
# Timeouts (seconds) for embedding and upsert operations
SCHEMA_INGESTION_EMBEDDING_TIMEOUT=300
SCHEMA_INGESTION_UPSERT_TIMEOUT=60
# Logging level for ingestion worker (DEBUG, INFO, WARNING, ERROR)
SCHEMA_INGESTION_LOG_LEVEL=INFO

# ========================================
# Document Ingestion Worker Configuration
# See: packages/document_ingestion_worker/CONFIG.md
# ========================================
# Processes PDF documents into vector embeddings for semantic search.
# Pipeline: PDF → Parse → Chunk → Embed → Qdrant
#
# Default profile: Balanced 16GB (optimized for 16GB RAM systems)
# See packages/document_ingestion_worker/README.md for all settings

# Qdrant settings
DOCUMENT_INGESTION_QDRANT_URL=http://localhost:6333
DOCUMENT_INGESTION_QDRANT_COLLECTION_NAME=methodology_documents
# DOCUMENT_INGESTION_QDRANT_API_KEY= # Optional API key

# Base data directory (all paths are derived from this)
# Structure:
#   data/input/documents/ - Place input PDFs here
#   data/staged/documents/ - Intermediate files (parsed JSON, raw chunks, etc.)
DOCUMENT_INGESTION_DATA_DIR=data

# Ingestion mode: 'override' (default, replace all data) or 'append' (add to existing data)
DOCUMENT_INGESTION_MODE=override

# Processing parallelism (default: 1 for 16GB systems)
# Increase for systems with more RAM (see packages/document_ingestion_worker/README.md for recommendations)
DOCUMENT_INGESTION_MAX_PARALLEL_FILES=1

# Subprocess timeout per document in seconds (default: 7200 = 2 hours)
# Handles 200+ page methodology documents on CPU. Reduce for smaller batches.
# DOCUMENT_INGESTION_SUBPROCESS_TIMEOUT_SECONDS=7200

# Embedding settings (bge_m3_onnx enables hybrid search with dense+sparse vectors)
DOCUMENT_INGESTION_EMBEDDING_MODEL_NAME=aapot/bge-m3-onnx
DOCUMENT_INGESTION_EMBEDDING_PROVIDER_TYPE=bge_m3_onnx
# Embedding batch size (default: 5 for 16GB, increase to 10+ for more RAM)
DOCUMENT_INGESTION_EMBEDDING_BATCH_SIZE=5
# Vector upsert batch size (default: 20)
DOCUMENT_INGESTION_VECTOR_UPSERT_BATCH_SIZE=20

# PDF Backend settings
# PDF parsing backend: dlparse_v2 (default, faster C++ parser, no OCR support)
# or dlparse_v1 (slower, supports Tesseract OCR for scanned documents)
DOCUMENT_INGESTION_PDF_BACKEND=dlparse_v2
# Page rendering scale (default: 2.0 for 16GB, set to 4.0 for High Quality profile)
DOCUMENT_INGESTION_PDF_IMAGES_SCALE=2.0

# PDF Parser settings
# Enable OCR for scanned PDFs (default: false - digital PDFs don't need OCR)
DOCUMENT_INGESTION_DO_OCR=false
# OCR uses Tesseract CLI (only option - install via: choco/apt/brew install tesseract)
# OCR language(s) - Tesseract format (ISO 639-3): ["eng"], ["eng", "deu"]
DOCUMENT_INGESTION_OCR_LANG=["eng"]
# Path to Tesseract executable (optional, uses system PATH if not set)
# Windows example: C:/Program Files/Tesseract-OCR/tesseract.exe
# DOCUMENT_INGESTION_TESSERACT_CMD=
# Force OCR on full page even if text is detected
DOCUMENT_INGESTION_FORCE_FULL_PAGE_OCR=false
# Enable table structure extraction
DOCUMENT_INGESTION_DO_TABLE_STRUCTURE=true
# Table structure mode: 'accurate' (default, for complex tables) or 'fast' (simple tables)
# DOCUMENT_INGESTION_TABLE_STRUCTURE_MODE=accurate
# Enable cell matching for tables
DOCUMENT_INGESTION_DO_CELL_MATCHING=true

# Surya Formula Enrichment (better accuracy than Docling's built-in model)
# Enabled by default - set to false to use Docling's formula model instead
DOCUMENT_INGESTION_USE_SURYA_FORMULA_ENRICHMENT=true
# Batch size for Surya (default: 2 for 16GB CPU, increase to 8-16 for GPU)
DOCUMENT_INGESTION_SURYA_BATCH_SIZE=2
# Pre-inference upscale (default: 1.5, improves subscript recognition)
DOCUMENT_INGESTION_SURYA_UPSCALE_FACTOR=1.5
# Bounding box expansion factors for formula detection (0.15 = 15% each side)
# DOCUMENT_INGESTION_SURYA_EXPANSION_FACTOR_HORIZONTAL=0.15
# DOCUMENT_INGESTION_SURYA_EXPANSION_FACTOR_VERTICAL=0.15

# Legacy Docling formula enrichment pipeline
# When USE_SURYA_FORMULA_ENRICHMENT=true (above), Surya replaces Docling's built-in
# formula model. This flag controls Docling's own enrichment pipeline and should
# remain true so that formula elements are still detected during layout analysis.
DOCUMENT_INGESTION_DO_FORMULA_ENRICHMENT=true

# Layout analysis model: heron-101 (default, best accuracy), heron (faster), egret-m/l/x
# DOCUMENT_INGESTION_LAYOUT_MODEL=heron-101
# Hardware acceleration: auto (default, detects CUDA > MPS > CPU), cuda, mps, cpu
# DOCUMENT_INGESTION_ACCELERATOR_DEVICE=auto

# Batch processing settings (default: 2 for 16GB CPU)
DOCUMENT_INGESTION_LAYOUT_BATCH_SIZE=2
DOCUMENT_INGESTION_OCR_BATCH_SIZE=2
DOCUMENT_INGESTION_TABLE_BATCH_SIZE=2
DOCUMENT_INGESTION_NUM_THREADS=2

# Table postprocessing
# Detect and merge tables split across page boundaries (default: true)
# Recommended for methodology documents with large tables spanning multiple pages
DOCUMENT_INGESTION_MERGE_SPLIT_TABLES=true
# Keep each table as a separate chunk (default: true)
# Prevents HybridChunker from merging multiple tables under the same heading
DOCUMENT_INGESTION_ISOLATE_TABLE_CHUNKS=true

# Chunker settings
# Maximum tokens per chunk (default: 5000 for comprehensive context)
DOCUMENT_INGESTION_CHUNK_MAX_TOKENS=5000
# Overlapping tokens between chunks (default: 0)
DOCUMENT_INGESTION_CHUNK_OVERLAP_TOKENS=0

# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
DOCUMENT_INGESTION_LOG_LEVEL=INFO

# Pipeline start point for resuming:
#   'beginning' - Full pipeline (parse → chunk → embed)
#   'parsed' - Skip parsing, load from staged/<doc>/parsed/
#   'chunked' - Skip parsing+chunking, load from staged/<doc>/chunks/
DOCUMENT_INGESTION_START_FROM=beginning
hedera-guardian-ai-toolkit/.gitignore

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
# Poetry
dist/
build/
*.egg-info/
*.egg

# Virtual environments
.venv/
venv/
env/
ENV/
.env.local

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
pip-log.txt
pip-delete-this-directory.txt
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# IDEs
.vscode/
.idea/
*.swp
*.swo
*~
*.sublime-project
*.sublime-workspace

# IDE settings
.project
.pydevproject
.settings/

# OS
Thumbs.db
.DS_Store
*.log

# Temporary files
*.tmp
*.temp
.tmp/
temp/

# Local development
.env
.env.*.local
*.local

# Data/Cache directories
data/

# Node.js
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*
package-lock.json
test_output/
hedera-guardian-ai-toolkit/.pre-commit-config.yaml

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
# Pre-commit hooks for code quality and consistency
# Install: poetry run pre-commit install
# Run manually: poetry run pre-commit run --all-files

repos:
  # Ruff - Fast Python linter and formatter
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.15.0
    hooks:
      # Run the linter
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
        types_or: [python, pyi]

      # Run the formatter
      - id: ruff-format
        types_or: [python, pyi]

  # Basic file checks
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v6.0.0
    hooks:
      - id: trailing-whitespace
        exclude: ^tests/.*snapshots/
      - id: end-of-file-fixer
        exclude: ^tests/.*snapshots/
      - id: check-yaml
      - id: check-toml
      - id: check-added-large-files
        args: [--maxkb=1000]
      - id: check-merge-conflict
      - id: check-case-conflict
      - id: detect-private-key

hedera-guardian-ai-toolkit/LICENSE

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Hedera Hashgraph LLC

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
