-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathconfig.py
More file actions
320 lines (261 loc) · 11.3 KB
/
config.py
File metadata and controls
320 lines (261 loc) · 11.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
"""
Velloris Configuration
Centralized configuration for audio settings, model settings, and VAD parameters.
"""
import os
from pathlib import Path
from utils.device_utils import get_optimal_device, get_optimal_dtype, get_platform_info
class AudioConfig:
    """Audio input/output configuration.

    Groups the sample-rate, buffering, quality, and device settings used
    by the capture/playback pipeline.
    """

    # --- Sample rates (Hz) ---
    INPUT_SAMPLE_RATE = 16000   # VAD standard, Whisper input
    OUTPUT_SAMPLE_RATE = 24000  # PersonaPlex/Qwen3 output

    # --- Buffering ---
    BUFFER_DURATION = 2.0  # transcribe every N seconds
    CHUNK_SIZE = 512       # audio frames delivered per callback
    OVERLAP_RATIO = 0.5    # overlap between windows for smooth transcription

    # --- Quality ---
    MONO = True     # single channel keeps processing cheap
    BIT_DEPTH = 16  # 16-bit PCM

    # --- Devices (None selects the system default) ---
    INPUT_DEVICE = None
    OUTPUT_DEVICE = None
class ModelConfig:
    """Model configuration.

    Storage paths, Hugging Face identifiers, STT/LLM/TTS selections, and
    device/dtype auto-detection for the model stack.
    """

    # Project root directory (the directory containing this file).
    PROJECT_ROOT = Path(__file__).parent

    # Model storage - project-local for offline/air-gapped use.
    MODELS_DIR = PROJECT_ROOT / "models"
    OFFLINE_MODE = os.getenv("VELLORIS_OFFLINE", "false").lower() == "true"

    # Model-specific directories.
    QWEN3_TTS_DIR = MODELS_DIR / "qwen3-tts"
    PERSONAPLEX_DIR = MODELS_DIR / "personaplex"
    SILERO_VAD_DIR = MODELS_DIR / "silero-vad"
    WHISPER_DIR = MODELS_DIR / "whisper"
    MLX_TTS_DIR = MODELS_DIR / "mlx-tts"
    MLX_WHISPER_DIR = MODELS_DIR / "mlx-whisper"
    MACECHO_DIR = MODELS_DIR / "macecho"  # macOS only
    MACECHO_SENSEVOICE_DIR = MODELS_DIR / "macecho-sensevoice"  # ASR model
    MACECHO_COSYVOICE_DIR = MODELS_DIR / "macecho-cosyvoice"  # TTS model

    # Hugging Face model identifiers.
    QWEN3_TTS_MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
    PERSONAPLEX_MODEL_ID = "nvidia/personaplex-7b-v1"
    MLX_TTS_MODEL_ID = "mlx-community/Qwen3-TTS-12Hz-1.7B-VoiceDesign-bf16"
    MLX_WHISPER_MODEL_ID = "mlx-community/whisper-large-v3-turbo-asr-fp16"

    # Whisper STT settings.
    # Options: tiny, base, small, medium, large
    # tiny = fastest, large = most accurate
    WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")

    # LLM settings (Ollama).
    OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3")

    # Auto-detect Ollama host: if running in WSL2 with Ollama on Windows,
    # use the WSL2 gateway IP to reach the Windows host.
    @staticmethod
    def _detect_ollama_host() -> str:
        """Resolve the base URL for the Ollama server.

        Precedence:
          1. Explicit OLLAMA_HOST environment variable.
          2. Under WSL2 (detected via /proc/sys/fs/binfmt_misc/WSLInterop),
             the Windows host reached via the default-route gateway IP,
             falling back to the resolv.conf nameserver.
          3. http://localhost:11434

        Detection is best-effort: any failure falls through to localhost.
        """
        explicit = os.getenv("OLLAMA_HOST")
        if explicit:
            return explicit
        # Check if WSL2 and Ollama isn't available on localhost.
        if os.path.exists("/proc/sys/fs/binfmt_misc/WSLInterop"):
            # Local import: only needed on the WSL2 path. Hoisted out of
            # the per-line loop where the original placed it.
            import subprocess
            try:
                with open("/etc/resolv.conf") as f:
                    for line in f:
                        if line.startswith("nameserver"):
                            wsl_host = line.split()[1]
                            # Try gateway IP instead (more reliable for WSL2).
                            result = subprocess.run(
                                ["ip", "route", "show", "default"],
                                capture_output=True, text=True, timeout=2,
                            )
                            if result.returncode == 0:
                                gateway = result.stdout.split()[2]
                                return f"http://{gateway}:11434"
                            return f"http://{wsl_host}:11434"
            except Exception:
                # Best-effort only; fall back to localhost below.
                pass
        return "http://localhost:11434"

    # Evaluated once at import time. Call through __func__ because a
    # staticmethod object is not directly callable inside the class body
    # on Python < 3.10 (it became callable in 3.10); __func__ works on
    # every version.
    OLLAMA_HOST = _detect_ollama_host.__func__()

    # TTS settings.
    TTS_MODEL = "1.7B-CustomVoice"  # Qwen3-TTS variant
    TTS_LANGUAGE = "en"  # Default language

    # Device settings (auto-detect if not specified).
    _requested_device = os.getenv("DEFAULT_DEVICE", "auto")
    DEVICE = get_optimal_device(_requested_device)
    _dtype_obj = get_optimal_dtype(DEVICE)
    DTYPE = str(_dtype_obj).split(".")[-1]  # "float32", "float16", or "bfloat16"

    # Platform info.
    PLATFORM_INFO = get_platform_info()

    # Voice paths.
    VOICES_DIR = PROJECT_ROOT / "voices"
    VOICE_REFERENCE = os.getenv("VOICE_REFERENCE", str(VOICES_DIR / "reference.wav"))

    @classmethod
    def get_local_model_path(cls, model_type: str) -> Path:
        """Return the local storage directory for *model_type*.

        Unknown types fall back to MODELS_DIR instead of raising, so the
        result is always a usable base path.
        """
        paths = {
            "qwen3-tts": cls.QWEN3_TTS_DIR,
            "personaplex": cls.PERSONAPLEX_DIR,
            "silero-vad": cls.SILERO_VAD_DIR,
            "whisper": cls.WHISPER_DIR,
            "mlx-tts": cls.MLX_TTS_DIR,
            "mlx-whisper": cls.MLX_WHISPER_DIR,
            "macecho": cls.MACECHO_DIR,
            "macecho-sensevoice": cls.MACECHO_SENSEVOICE_DIR,
            "macecho-cosyvoice": cls.MACECHO_COSYVOICE_DIR,
        }
        return paths.get(model_type, cls.MODELS_DIR)

    @classmethod
    def is_model_downloaded(cls, model_type: str) -> bool:
        """Check if a model is downloaded locally.

        A model counts as downloaded only when its directory exists and
        contains at least one entry; a missing path, a plain file, or an
        empty directory (e.g. an interrupted download) returns False.
        """
        model_path = cls.get_local_model_path(model_type)
        if not model_path.exists():
            return False
        # Check if directory has content.
        return any(model_path.iterdir()) if model_path.is_dir() else False
class VADConfig:
    """Voice Activity Detection configuration (Silero VAD)."""

    # Confidence cut-off for classifying audio as speech (0.0-1.0).
    THRESHOLD = 0.5
    # Segments shorter than these durations (seconds) are ignored.
    MIN_SPEECH_DURATION = 0.3
    MIN_SILENCE_DURATION = 0.3

    # Detection sensitivity: higher values react to quieter audio but may
    # also pick up background noise.
    SENSITIVITY = 0.5

    # Barge-in: whether the user may interrupt the AI mid-response, and
    # how quickly user speech triggers the interruption.
    ENABLE_BARGE_IN = True
    INTERRUPT_THRESHOLD = 0.6
class ApplicationConfig:
    """Application-level configuration.

    Mode selection, per-mode tuning, logging, retry policy, and worker
    pool sizing.
    """

    # --- Operating modes -------------------------------------------------
    MODES = ["realtime", "dubbing", "creative"]
    DEFAULT_MODE = os.getenv("DEFAULT_MODE", "realtime")  # realtime, dubbing, creative

    # --- Real-time mode (PersonaPlex/MacEcho end-to-end S2S) -------------
    REALTIME_VOICE = os.getenv("REALTIME_VOICE", "NATF2")  # default voice
    REALTIME_PERSONA = os.getenv(
        "REALTIME_PERSONA", "You are a helpful and friendly AI assistant."
    )
    REALTIME_STREAMING = True     # streaming enables full-duplex conversation
    REALTIME_TIMEOUT = 30.0       # seconds to wait for user input
    REALTIME_SAMPLE_RATE = 24000  # native output sample rate (Hz)
    # Seconds to wait for a MacEcho response before giving up.
    MACECHO_RESPONSE_TIMEOUT = float(os.getenv("MACECHO_RESPONSE_TIMEOUT", "10.0"))

    # --- Creative mode (Ollama + Qwen3-TTS) ------------------------------
    CREATIVE_LLM = os.getenv("CREATIVE_LLM", "llama3")
    CREATIVE_DEFAULT_EMOTION = os.getenv("CREATIVE_EMOTION", "")
    CREATIVE_TIMEOUT = 120.0  # seconds to wait for the LLM response

    # --- Dubbing mode (Qwen3-TTS/MLX-Audio high-fidelity) ----------------
    DUBBING_CHUNK_SIZE = 256  # script is processed in chunks of this size
    DUBBING_TIMEOUT = 60.0    # seconds to wait for TTS generation
    DUBBING_LANGUAGE = os.getenv("DUBBING_LANGUAGE", "en")  # default TTS language (en = English)
    # Languages supported by Qwen3-TTS and MLX-Audio.
    SUPPORTED_LANGUAGES = [
        "english",
        "chinese",
        "japanese",
        "korean",
        "german",
        "french",
        "russian",
        "portuguese",
        "spanish",
        "italian",
    ]

    # --- Logging ---------------------------------------------------------
    LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
    LOG_FILE = None  # None routes everything to stdout

    # --- Error handling --------------------------------------------------
    RETRY_ATTEMPTS = 3
    RETRY_DELAY = 1.0  # seconds between retries

    # --- Performance -----------------------------------------------------
    MAX_WORKERS = 4  # thread pool size
    ENABLE_PROFILING = False
class Config:
    """Combined configuration object.

    Aggregates the per-domain config sections behind one namespace
    (Config.audio, Config.model, Config.vad, Config.app) and provides
    validation and debug-printing helpers.
    """
    # Shared instances of each configuration section.
    audio = AudioConfig()
    model = ModelConfig()
    vad = VADConfig()
    app = ApplicationConfig()

    @classmethod
    def from_env(cls):
        """Load configuration from environment variables.

        Currently a no-op: each section already reads os.getenv at import
        time. Kept as the hook point for .env-file loading.
        """
        # This is a placeholder for loading from .env files
        # In production, use python-dotenv
        pass

    @classmethod
    def validate(cls) -> bool:
        """Validate configuration settings.

        Returns True when every check passes; otherwise prints each
        problem and returns False. Never raises.
        """
        errors = []
        # Check model names
        valid_whisper = ["tiny", "base", "small", "medium", "large"]
        if cls.model.WHISPER_MODEL not in valid_whisper:
            errors.append(f"Invalid WHISPER_MODEL: {cls.model.WHISPER_MODEL}")
        # Check device (DEVICE is already validated and resolved by get_optimal_device)
        valid_devices = ["cuda", "cpu", "mps"]
        if cls.model.DEVICE not in valid_devices:
            errors.append(f"Invalid DEVICE: {cls.model.DEVICE}")
        # Check VAD threshold
        if not 0.0 <= cls.vad.THRESHOLD <= 1.0:
            errors.append("VAD THRESHOLD must be between 0.0 and 1.0")
        if errors:
            print("Configuration validation errors:")
            for error in errors:
                print(f" - {error}")
            return False
        return True

    @classmethod
    def print_config(cls):
        """Print current configuration.

        Dumps platform, audio, model, model-download status, VAD, and
        per-mode settings to stdout for debugging.
        """
        print("\n=== Velloris Configuration ===")
        # Platform info
        print("\nPlatform:")
        print(
            f" OS: {cls.model.PLATFORM_INFO['os']} ({cls.model.PLATFORM_INFO['machine']})"
        )
        print(f" Python: {cls.model.PLATFORM_INFO['python_version']}")
        print(f" CUDA Available: {cls.model.PLATFORM_INFO['cuda_available']}")
        print(f" MPS Available: {cls.model.PLATFORM_INFO['mps_available']}")
        print("\nAudio:")
        print(f" Input SR: {cls.audio.INPUT_SAMPLE_RATE} Hz")
        print(f" Output SR: {cls.audio.OUTPUT_SAMPLE_RATE} Hz")
        print(f" Buffer: {cls.audio.BUFFER_DURATION}s")
        print("\nModels:")
        print(f" Models Dir: {cls.model.MODELS_DIR}")
        print(f" Offline Mode: {cls.model.OFFLINE_MODE}")
        print(f" Whisper: {cls.model.WHISPER_MODEL}")
        print(f" LLM: {cls.model.OLLAMA_MODEL}")
        print(f" Device: {cls.model.DEVICE}")
        print(f" Dtype: {cls.model.DTYPE}")
        print(f" Ollama: {cls.model.OLLAMA_HOST}")
        print("\nModel Status:")
        models_to_check = ["qwen3-tts", "personaplex", "silero-vad", "whisper"]
        # Add MacEcho models on macOS
        import sys
        if sys.platform == "darwin":
            models_to_check.append("macecho")
        # Report whether each model's local directory exists and is non-empty.
        for model in models_to_check:
            status = (
                "[OK] Downloaded"
                if cls.model.is_model_downloaded(model)
                else "[X] Not found"
            )
            print(f" {model}: {status}")
        print("\nVAD:")
        print(f" Threshold: {cls.vad.THRESHOLD}")
        print(f" Barge-in: {cls.vad.ENABLE_BARGE_IN}")
        print("\nApplication:")
        print(f" Default Mode: {cls.app.DEFAULT_MODE}")
        print(f" Available Modes: {', '.join(cls.app.MODES)}")
        print(f" Log Level: {cls.app.LOG_LEVEL}")
        print("\nMode Settings:")
        print(" Real-Time:")
        print(f" Voice: {cls.app.REALTIME_VOICE}")
        # Persona is truncated to keep the dump readable.
        print(f" Persona: {cls.app.REALTIME_PERSONA[:50]}...")
        print(f" Streaming: {cls.app.REALTIME_STREAMING}")
        print(" Creative:")
        print(f" LLM: {cls.app.CREATIVE_LLM}")
        print(f" Emotion: {cls.app.CREATIVE_DEFAULT_EMOTION or 'None'}")
        print(" Dubbing:")
        print(f" Chunk Size: {cls.app.DUBBING_CHUNK_SIZE}\n")
# Script entry point: dump the active configuration, then run validation
# (which prints any problems it finds).
if __name__ == "__main__":
    Config.print_config()
    Config.validate()