# =============================================================================
# Boardy Semantic Cache Service - Environment Variables
# =============================================================================
# Copy this file to .env and fill in your actual values.
# Every variable except OPENAI_API_KEY has a sensible default; OPENAI_API_KEY
# has no default and is required for the service to work.
# -----------------------------------------------------------------------------
# Required Configuration
# -----------------------------------------------------------------------------
# OpenAI API Key (REQUIRED)
# Get your API key from: https://platform.openai.com/api-keys
OPENAI_API_KEY=
# -----------------------------------------------------------------------------
# OpenAI Model Configuration
# -----------------------------------------------------------------------------
# Chat model to use for LLM completions
# Options: gpt-4o-mini, gpt-4o, gpt-4o-search-preview, etc.
CHAT_MODEL=gpt-4o-mini
# Enable web search for time-sensitive queries using OpenAI search-preview models
# When enabled, uses search-preview models that can access web search
ENABLE_WEB_SEARCH=false
# -----------------------------------------------------------------------------
# Cache Configuration
# -----------------------------------------------------------------------------
# Similarity threshold for semantic cache matching (0.0 to 1.0)
# Higher values = stricter matching (fewer false positives, more cache misses)
# Lower values = more lenient matching (more cache hits, risk of false positives)
SIMILARITY_THRESHOLD=0.75
# Maximum number of LLM calls allowed before returning fallback response
# Set to 0 for unlimited (not recommended)
MAX_LLM_CALLS=100
# Cost per LLM call for cost estimation (in USD)
# Used for tracking and displaying estimated costs in the UI
LLM_COST_PER_CALL=0.01
# -----------------------------------------------------------------------------
# Redis Configuration
# -----------------------------------------------------------------------------
# Redis connection URL
# Format: redis://[host]:[port]/[database]
# For Docker Compose, use: redis://redis:6379/0
# For local Redis, use: redis://localhost:6379/0
REDIS_URL=redis://redis:6379/0
# -----------------------------------------------------------------------------
# TTL (Time-To-Live) Configuration
# -----------------------------------------------------------------------------
# Short TTL for time-sensitive queries (in seconds)
# Default: 600 (10 minutes)
SHORT_TTL_SECONDS=600
# Long TTL for evergreen queries (in seconds)
# Default: 86400 (24 hours)
LONG_TTL_SECONDS=86400
# TTL for cached embeddings (in seconds)
# Default: 604800 (7 days)
EMBEDDING_CACHE_TTL_SECONDS=604800
# Per-query-type maximum age limits (JSON object mapping query type -> seconds)
# Cached entries older than these limits are invalidated even if their TTL has not expired
# Format: {"domain": seconds}
# Example: {"weather": 3600, "news": 1800, "price": 1800, "score": 600}
MAX_AGE_BY_QUERY_TYPE={"weather": 3600, "news": 1800, "price": 1800, "score": 600}
# -----------------------------------------------------------------------------
# Weaviate Vector Database Configuration (Optional)
# -----------------------------------------------------------------------------
# Enable Weaviate vector database for faster semantic search
# When false, uses Redis-based linear search (slower but simpler)
USE_WEAVIATE=false
# Weaviate server URL
# For Docker Compose, use: http://weaviate:8080
# For local Weaviate, use: http://localhost:8080
WEAVIATE_URL=http://weaviate:8080
# Weaviate API key (leave empty if using anonymous access)
WEAVIATE_API_KEY=
# -----------------------------------------------------------------------------
# Performance & Batch Configuration
# -----------------------------------------------------------------------------
# Maximum number of embeddings per batch request to OpenAI
# OpenAI limit: 2048
MAX_BATCH_SIZE=2048
# Maximum number of concurrent LLM calls for parallel processing
# Adjust based on your OpenAI rate limits
MAX_PARALLEL_LLM_CALLS=10
# -----------------------------------------------------------------------------
# LLM Prompt Configuration
# -----------------------------------------------------------------------------
# System prompt for LLM completions
LLM_SYSTEM_PROMPT=You are a concise assistant. Answer briefly and factually.
# Fallback response when MAX_LLM_CALLS limit is reached
LLM_FALLBACK_RESPONSE=LLM call limit reached. Please try again later.
# -----------------------------------------------------------------------------
# Logging Configuration
# -----------------------------------------------------------------------------
# Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL
LOG_LEVEL=INFO
# Use JSON format for logs (true) or standard text format (false)
# JSON format is recommended for production and log aggregation tools
USE_JSON_LOGGING=true
# -----------------------------------------------------------------------------
# Circuit Breaker Configuration (Optional - for resilience features)
# -----------------------------------------------------------------------------
# Enable/disable circuit breakers for external service calls
# Circuit breakers prevent cascade failures by stopping requests when services fail
CIRCUIT_BREAKER_ENABLED=true
# Failure threshold (0.0 to 1.0) - circuit opens when failure rate exceeds this
# Example: 0.5 means circuit opens when 50% of requests fail
CIRCUIT_BREAKER_FAILURE_THRESHOLD=0.5
# Rolling time window in seconds for failure calculation
# Failures within this window are counted toward the failure threshold
CIRCUIT_BREAKER_TIME_WINDOW_SECONDS=60
# How long circuit stays OPEN before transitioning to HALF_OPEN (seconds)
# After this duration, circuit allows test requests to check if service recovered
CIRCUIT_BREAKER_OPEN_DURATION_SECONDS=30
# Success threshold (0.0 to 1.0) - circuit closes when success rate exceeds this in HALF_OPEN
# Example: 0.8 means circuit closes when 80% of test requests succeed
CIRCUIT_BREAKER_SUCCESS_THRESHOLD=0.8
# Maximum number of test calls allowed in HALF_OPEN state
# Limits how many requests are used to test if service has recovered
CIRCUIT_BREAKER_HALF_OPEN_MAX_CALLS=5
# -----------------------------------------------------------------------------
# Graceful Degradation Configuration (Optional - for high-load scenarios)
# -----------------------------------------------------------------------------
# Maximum number of concurrent requests allowed
# When exceeded, new requests may be rejected or queued
MAX_CONCURRENT_REQUESTS=1000
# Request timeout in seconds
# Requests exceeding this timeout will be cancelled
REQUEST_TIMEOUT_SECONDS=30