forked from Context-Engine-AI/Context-Engine
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path.env.example
More file actions
165 lines (134 loc) · 5.21 KB
/
.env.example
File metadata and controls
165 lines (134 loc) · 5.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# Qdrant connection
QDRANT_URL=http://localhost:6333
QDRANT_API_KEY=
# Multi-repo mode: 0=single-repo (default), 1=multi-repo
# Single-repo: All files go into one collection (COLLECTION_NAME)
# Multi-repo: Each subdirectory gets its own collection
MULTI_REPO_MODE=0
# Single unified collection for seamless cross-repo search (default: "codebase")
# Leave unset or use "codebase" for unified search across all your code
COLLECTION_NAME=codebase
# Embeddings
EMBEDDING_MODEL=BAAI/bge-base-en-v1.5
EMBEDDING_PROVIDER=fastembed
# Optional repo tag attached to each payload
REPO_NAME=workspace
# MCP servers (SSE)
FASTMCP_HOST=0.0.0.0
# search/store MCP (mcp-server-qdrant)
FASTMCP_PORT=8000
# companion indexer MCP (index/prune/list)
FASTMCP_INDEXER_PORT=8001
FASTMCP_SERVER_NAME=qdrant-mcp
# MCP_MAX_LOG_TAIL=4000 # Max chars for subprocess stdout/stderr tail (default: 4000)
# Transport: sse (default), http (streamable), or stdio
FASTMCP_TRANSPORT=sse
# Optional duplicate Streamable HTTP instances (run alongside SSE)
# Use these in docker-compose overrides to expose /mcp/ on separate ports.
FASTMCP_HTTP_TRANSPORT=http
FASTMCP_HTTP_PORT=8002
FASTMCP_HTTP_HEALTH_PORT=18002
FASTMCP_INDEXER_HTTP_PORT=8003
FASTMCP_INDEXER_HTTP_HEALTH_PORT=18003
# Optional: local cross-encoder reranker (ONNX)
# Set these to enable make rerank-local inside the container
RERANKER_ONNX_PATH=/work/models/model_qint8_avx512_vnni.onnx
RERANKER_TOKENIZER_PATH=/work/models/tokenizer.json
# Enable reranking in the indexer MCP search path
RERANKER_ENABLED=0
# Tuning knobs (effective when enabled)
RERANKER_TOPN=50
RERANKER_RETURN_M=12
RERANKER_TIMEOUT_MS=2000
# Safety: minimum rerank timeout floor (ms) to avoid cold-start timeouts
RERANK_TIMEOUT_FLOOR_MS=1000
# Optional warmups (disabled by default)
EMBEDDING_WARMUP=0
RERANK_WARMUP=0
# In-process execution (faster; falls back to subprocess on failure)
HYBRID_IN_PROCESS=1
RERANK_IN_PROCESS=1
# LLM query expansion (prefer local runtime)
# If OLLAMA_HOST is reachable, we use it by default with LLM_EXPAND_MODEL (e.g., glm4 or glm-4.5-air)
# Otherwise, if OPENAI_API_KEY is set, we fallback to OpenAI with the same model name
# Local LLM expansion via Ollama (mini model)
LLM_PROVIDER=ollama
OLLAMA_HOST=http://host.docker.internal:11434
LLM_EXPAND_MODEL=phi3:mini
LLM_EXPAND_MAX=4
# PRF defaults (enabled by default)
PRF_ENABLED=1
# Tree-sitter parsing (enable for more accurate symbols/scopes)
# Indexer scaling and exclusions
# Exclusions: defaults can be disabled or extended
# QDRANT_DEFAULT_EXCLUDES=0
# QDRANT_IGNORE_FILE=.qdrantignore
# QDRANT_EXCLUDES=tokenizer.json,*.onnx,/vendor
# Chunking + batching (tune for large repos)
# INDEX_CHUNK_LINES=120
# INDEX_CHUNK_OVERLAP=20
# INDEX_BATCH_SIZE=64
# INDEX_PROGRESS_EVERY=200
USE_TREE_SITTER=0
# ReFRAG mode (optional): compact gating + micro-chunking
# Enable to add a 64-dim mini vector for fast gating and use token-based micro-chunks
REFRAG_MODE=0
MINI_VECTOR_NAME=mini
MINI_VEC_DIM=64
MINI_VEC_SEED=1337
HYBRID_MINI_WEIGHT=0.5
# Micro-chunking controls (token-based)
INDEX_MICRO_CHUNKS=0
MICRO_CHUNK_TOKENS=16
MICRO_CHUNK_STRIDE=8
# Optional: gate-first using mini vectors to prefilter dense search
REFRAG_GATE_FIRST=0
REFRAG_CANDIDATES=200
# Output shaping for micro spans (defaults shown)
MICRO_OUT_MAX_SPANS=3
MICRO_MERGE_LINES=4
MICRO_BUDGET_TOKENS=512
MICRO_TOKENS_PER_LINE=32
# Decoder-path ReFRAG (feature-flagged; off by default)
REFRAG_DECODER=0
REFRAG_RUNTIME=llamacpp
REFRAG_ENCODER_MODEL=BAAI/bge-base-en-v1.5
REFRAG_PHI_PATH=/work/models/refrag_phi_768_to_dmodel.json
REFRAG_SENSE=heuristic
# Llama.cpp sidecar (optional)
# Docker CPU-only (stable): http://llamacpp:8080
# Native GPU-accelerated (fast): http://localhost:8081
LLAMACPP_URL=http://llamacpp:8080
# Decoder mode: prompt | soft
REFRAG_DECODER_MODE=prompt
# GLM_API_BASE=https://api.z.ai/api/coding/paas/v4/
# GLM_MODEL=glm-4.6
# GPU Performance Toggle
# Set to 1 to use native GPU-accelerated server on localhost:8081
# Set to 0 to use Docker CPU-only server (default, stable)
USE_GPU_DECODER=0
REFRAG_SOFT_SCALE=1.0
# Llama.cpp runtime tuning
# Set to 1 to enable Metal/CLBlast acceleration
LLAMACPP_USE_GPU=0
# LLAMACPP_GPU_LAYERS=-1 # Override number of layers to offload (defaults to -1 when USE_GPU=1)
# LLAMACPP_GPU_SPLIT= # Optional tensor split for multi-GPU setups
# LLAMACPP_THREADS= # Override number of CPU threads
# LLAMACPP_CTX_SIZE=8192 # Context tokens; higher values need more VRAM
# LLAMACPP_EXTRA_ARGS= # Additional flags passed verbatim to llama.cpp
# Operational safeguards and timeouts
# Limit explosion of micro-chunks on huge files (0 to disable)
MAX_MICRO_CHUNKS_PER_FILE=2000
# Qdrant request timeout (seconds)
QDRANT_TIMEOUT=20
# Memory collection auto-detection cache
MEMORY_AUTODETECT=1
MEMORY_COLLECTION_TTL_SECS=300
# Watcher-safe defaults (recommended)
# Applied to watcher via compose; uncomment to apply globally.
# Qdrant write/read timeout (seconds)
# QDRANT_TIMEOUT=60
# Cap per-file micro-chunks (keeps upserts small)
# MAX_MICRO_CHUNKS_PER_FILE=200
# Upsert batching + retries for large payloads
# INDEX_UPSERT_BATCH=128
# INDEX_UPSERT_RETRIES=5
# INDEX_UPSERT_BACKOFF=0.5
# Debounce file events to coalesce bursts
# WATCH_DEBOUNCE_SECS=1.5