-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathconfig.py
More file actions
320 lines (261 loc) · 11.3 KB
/
config.py
File metadata and controls
320 lines (261 loc) · 11.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
"""
Velloris Configuration
Centralized configuration for audio settings, model settings, and VAD parameters.
"""
import os
from pathlib import Path
from utils.device_utils import get_optimal_device, get_optimal_dtype, get_platform_info
class AudioConfig:
    """Audio input/output configuration.

    Groups the sample-rate, buffering, quality, and device settings used
    by the capture/playback pipeline.
    """

    # --- Sample rates (Hz) ---
    INPUT_SAMPLE_RATE = 16000   # VAD standard, Whisper input
    OUTPUT_SAMPLE_RATE = 24000  # PersonaPlex/Qwen3 output

    # --- Buffering ---
    BUFFER_DURATION = 2.0  # transcribe every N seconds
    CHUNK_SIZE = 512       # audio frames delivered per callback
    OVERLAP_RATIO = 0.5    # overlap between windows for smooth transcription

    # --- Quality ---
    MONO = True     # single channel keeps processing cheap
    BIT_DEPTH = 16  # 16-bit PCM

    # --- Devices (None selects the system default) ---
    INPUT_DEVICE = None
    OUTPUT_DEVICE = None
class ModelConfig:
    """Model configuration.

    Storage paths, Hugging Face identifiers, STT/LLM/TTS selections, and
    device/dtype auto-detection for the model stack.
    """

    # Project root directory (the directory containing this file).
    PROJECT_ROOT = Path(__file__).parent

    # Model storage - project-local for offline/air-gapped use.
    MODELS_DIR = PROJECT_ROOT / "models"
    OFFLINE_MODE = os.getenv("VELLORIS_OFFLINE", "false").lower() == "true"

    # Model-specific directories.
    QWEN3_TTS_DIR = MODELS_DIR / "qwen3-tts"
    PERSONAPLEX_DIR = MODELS_DIR / "personaplex"
    SILERO_VAD_DIR = MODELS_DIR / "silero-vad"
    WHISPER_DIR = MODELS_DIR / "whisper"
    MLX_TTS_DIR = MODELS_DIR / "mlx-tts"
    MLX_WHISPER_DIR = MODELS_DIR / "mlx-whisper"
    MACECHO_DIR = MODELS_DIR / "macecho"  # macOS only
    MACECHO_SENSEVOICE_DIR = MODELS_DIR / "macecho-sensevoice"  # ASR model
    MACECHO_COSYVOICE_DIR = MODELS_DIR / "macecho-cosyvoice"  # TTS model

    # Hugging Face model identifiers.
    QWEN3_TTS_MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
    PERSONAPLEX_MODEL_ID = "nvidia/personaplex-7b-v1"
    MLX_TTS_MODEL_ID = "mlx-community/Qwen3-TTS-12Hz-1.7B-VoiceDesign-bf16"
    MLX_WHISPER_MODEL_ID = "mlx-community/whisper-large-v3-turbo-asr-fp16"

    # Whisper STT settings.
    # Options: tiny, base, small, medium, large
    # tiny = fastest, large = most accurate
    WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")

    # LLM settings (Ollama).
    OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3")

    # Auto-detect Ollama host: if running in WSL2 with Ollama on Windows,
    # use the WSL2 gateway IP to reach the Windows host.
    @staticmethod
    def _detect_ollama_host() -> str:
        """Resolve the base URL for the Ollama server.

        Precedence:
          1. Explicit OLLAMA_HOST environment variable.
          2. Under WSL2 (detected via /proc/sys/fs/binfmt_misc/WSLInterop),
             the Windows host reached via the default-route gateway IP,
             falling back to the resolv.conf nameserver.
          3. http://localhost:11434

        Detection is best-effort: any failure falls through to localhost.
        """
        explicit = os.getenv("OLLAMA_HOST")
        if explicit:
            return explicit
        # Check if WSL2 and Ollama isn't available on localhost.
        if os.path.exists("/proc/sys/fs/binfmt_misc/WSLInterop"):
            # Local import: only needed on the WSL2 path. Hoisted out of
            # the per-line loop where the original placed it.
            import subprocess
            try:
                with open("/etc/resolv.conf") as f:
                    for line in f:
                        if line.startswith("nameserver"):
                            wsl_host = line.split()[1]
                            # Try gateway IP instead (more reliable for WSL2).
                            result = subprocess.run(
                                ["ip", "route", "show", "default"],
                                capture_output=True, text=True, timeout=2,
                            )
                            if result.returncode == 0:
                                gateway = result.stdout.split()[2]
                                return f"http://{gateway}:11434"
                            return f"http://{wsl_host}:11434"
            except Exception:
                # Best-effort only; fall back to localhost below.
                pass
        return "http://localhost:11434"

    # Evaluated once at import time. Call through __func__ because a
    # staticmethod object is not directly callable inside the class body
    # on Python < 3.10 (it became callable in 3.10); __func__ works on
    # every version.
    OLLAMA_HOST = _detect_ollama_host.__func__()

    # TTS settings.
    TTS_MODEL = "1.7B-CustomVoice"  # Qwen3-TTS variant
    TTS_LANGUAGE = "en"  # Default language

    # Device settings (auto-detect if not specified).
    _requested_device = os.getenv("DEFAULT_DEVICE", "auto")
    DEVICE = get_optimal_device(_requested_device)
    _dtype_obj = get_optimal_dtype(DEVICE)
    DTYPE = str(_dtype_obj).split(".")[-1]  # "float32", "float16", or "bfloat16"

    # Platform info.
    PLATFORM_INFO = get_platform_info()

    # Voice paths.
    VOICES_DIR = PROJECT_ROOT / "voices"
    VOICE_REFERENCE = os.getenv("VOICE_REFERENCE", str(VOICES_DIR / "reference.wav"))

    @classmethod
    def get_local_model_path(cls, model_type: str) -> Path:
        """Return the local storage directory for *model_type*.

        Unknown types fall back to MODELS_DIR instead of raising, so the
        result is always a usable base path.
        """
        paths = {
            "qwen3-tts": cls.QWEN3_TTS_DIR,
            "personaplex": cls.PERSONAPLEX_DIR,
            "silero-vad": cls.SILERO_VAD_DIR,
            "whisper": cls.WHISPER_DIR,
            "mlx-tts": cls.MLX_TTS_DIR,
            "mlx-whisper": cls.MLX_WHISPER_DIR,
            "macecho": cls.MACECHO_DIR,
            "macecho-sensevoice": cls.MACECHO_SENSEVOICE_DIR,
            "macecho-cosyvoice": cls.MACECHO_COSYVOICE_DIR,
        }
        return paths.get(model_type, cls.MODELS_DIR)

    @classmethod
    def is_model_downloaded(cls, model_type: str) -> bool:
        """Check if a model is downloaded locally.

        A model counts as downloaded only when its directory exists and
        contains at least one entry; a missing path, a plain file, or an
        empty directory (e.g. an interrupted download) returns False.
        """
        model_path = cls.get_local_model_path(model_type)
        if not model_path.exists():
            return False
        # Check if directory has content.
        return any(model_path.iterdir()) if model_path.is_dir() else False
class VADConfig:
    """Voice Activity Detection configuration (Silero VAD)."""

    # Confidence cut-off for classifying audio as speech (0.0-1.0).
    THRESHOLD = 0.5
    # Segments shorter than these durations (seconds) are ignored.
    MIN_SPEECH_DURATION = 0.3
    MIN_SILENCE_DURATION = 0.3

    # Detection sensitivity: higher values react to quieter audio but may
    # also pick up background noise.
    SENSITIVITY = 0.5

    # Barge-in: whether the user may interrupt the AI mid-response, and
    # how quickly user speech triggers the interruption.
    ENABLE_BARGE_IN = True
    INTERRUPT_THRESHOLD = 0.6
class ApplicationConfig:
    """Application-level configuration.

    Mode selection, per-mode tuning, logging, retry policy, and worker
    pool sizing.
    """

    # --- Operating modes -------------------------------------------------
    MODES = ["realtime", "dubbing", "creative"]
    DEFAULT_MODE = os.getenv("DEFAULT_MODE", "realtime")  # realtime, dubbing, creative

    # --- Real-time mode (PersonaPlex/MacEcho end-to-end S2S) -------------
    REALTIME_VOICE = os.getenv("REALTIME_VOICE", "NATF2")  # default voice
    REALTIME_PERSONA = os.getenv(
        "REALTIME_PERSONA", "You are a helpful and friendly AI assistant."
    )
    REALTIME_STREAMING = True     # streaming enables full-duplex conversation
    REALTIME_TIMEOUT = 30.0       # seconds to wait for user input
    REALTIME_SAMPLE_RATE = 24000  # native output sample rate (Hz)
    # Seconds to wait for a MacEcho response before giving up.
    MACECHO_RESPONSE_TIMEOUT = float(os.getenv("MACECHO_RESPONSE_TIMEOUT", "10.0"))

    # --- Creative mode (Ollama + Qwen3-TTS) ------------------------------
    CREATIVE_LLM = os.getenv("CREATIVE_LLM", "llama3")
    CREATIVE_DEFAULT_EMOTION = os.getenv("CREATIVE_EMOTION", "")
    CREATIVE_TIMEOUT = 120.0  # seconds to wait for the LLM response

    # --- Dubbing mode (Qwen3-TTS/MLX-Audio high-fidelity) ----------------
    DUBBING_CHUNK_SIZE = 256  # script is processed in chunks of this size
    DUBBING_TIMEOUT = 60.0    # seconds to wait for TTS generation
    DUBBING_LANGUAGE = os.getenv("DUBBING_LANGUAGE", "en")  # default TTS language (en = English)
    # Languages supported by Qwen3-TTS and MLX-Audio.
    SUPPORTED_LANGUAGES = [
        "english",
        "chinese",
        "japanese",
        "korean",
        "german",
        "french",
        "russian",
        "portuguese",
        "spanish",
        "italian",
    ]

    # --- Logging ---------------------------------------------------------
    LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
    LOG_FILE = None  # None routes everything to stdout

    # --- Error handling --------------------------------------------------
    RETRY_ATTEMPTS = 3
    RETRY_DELAY = 1.0  # seconds between retries

    # --- Performance -----------------------------------------------------
    MAX_WORKERS = 4  # thread pool size
    ENABLE_PROFILING = False
class Config:
    """Combined configuration object.

    Aggregates the per-domain config sections behind one namespace
    (Config.audio, Config.model, Config.vad, Config.app) and provides
    validation and debug-printing helpers.
    """
    # Shared instances of each configuration section.
    audio = AudioConfig()
    model = ModelConfig()
    vad = VADConfig()
    app = ApplicationConfig()

    @classmethod
    def from_env(cls):
        """Load configuration from environment variables.

        Currently a no-op: each section already reads os.getenv at import
        time. Kept as the hook point for .env-file loading.
        """
        # This is a placeholder for loading from .env files
        # In production, use python-dotenv
        pass

    @classmethod
    def validate(cls) -> bool:
        """Validate configuration settings.

        Returns True when every check passes; otherwise prints each
        problem and returns False. Never raises.
        """
        errors = []
        # Check model names
        valid_whisper = ["tiny", "base", "small", "medium", "large"]
        if cls.model.WHISPER_MODEL not in valid_whisper:
            errors.append(f"Invalid WHISPER_MODEL: {cls.model.WHISPER_MODEL}")
        # Check device (DEVICE is already validated and resolved by get_optimal_device)
        valid_devices = ["cuda", "cpu", "mps"]
        if cls.model.DEVICE not in valid_devices:
            errors.append(f"Invalid DEVICE: {cls.model.DEVICE}")
        # Check VAD threshold
        if not 0.0 <= cls.vad.THRESHOLD <= 1.0:
            errors.append("VAD THRESHOLD must be between 0.0 and 1.0")
        if errors:
            print("Configuration validation errors:")
            for error in errors:
                print(f" - {error}")
            return False
        return True

    @classmethod
    def print_config(cls):
        """Print current configuration.

        Dumps platform, audio, model, model-download status, VAD, and
        per-mode settings to stdout for debugging.
        """
        print("\n=== Velloris Configuration ===")
        # Platform info
        print("\nPlatform:")
        print(
            f" OS: {cls.model.PLATFORM_INFO['os']} ({cls.model.PLATFORM_INFO['machine']})"
        )
        print(f" Python: {cls.model.PLATFORM_INFO['python_version']}")
        print(f" CUDA Available: {cls.model.PLATFORM_INFO['cuda_available']}")
        print(f" MPS Available: {cls.model.PLATFORM_INFO['mps_available']}")
        print("\nAudio:")
        print(f" Input SR: {cls.audio.INPUT_SAMPLE_RATE} Hz")
        print(f" Output SR: {cls.audio.OUTPUT_SAMPLE_RATE} Hz")
        print(f" Buffer: {cls.audio.BUFFER_DURATION}s")
        print("\nModels:")
        print(f" Models Dir: {cls.model.MODELS_DIR}")
        print(f" Offline Mode: {cls.model.OFFLINE_MODE}")
        print(f" Whisper: {cls.model.WHISPER_MODEL}")
        print(f" LLM: {cls.model.OLLAMA_MODEL}")
        print(f" Device: {cls.model.DEVICE}")
        print(f" Dtype: {cls.model.DTYPE}")
        print(f" Ollama: {cls.model.OLLAMA_HOST}")
        print("\nModel Status:")
        models_to_check = ["qwen3-tts", "personaplex", "silero-vad", "whisper"]
        # Add MacEcho models on macOS
        import sys
        if sys.platform == "darwin":
            models_to_check.append("macecho")
        # Report whether each model's local directory exists and is non-empty.
        for model in models_to_check:
            status = (
                "[OK] Downloaded"
                if cls.model.is_model_downloaded(model)
                else "[X] Not found"
            )
            print(f" {model}: {status}")
        print("\nVAD:")
        print(f" Threshold: {cls.vad.THRESHOLD}")
        print(f" Barge-in: {cls.vad.ENABLE_BARGE_IN}")
        print("\nApplication:")
        print(f" Default Mode: {cls.app.DEFAULT_MODE}")
        print(f" Available Modes: {', '.join(cls.app.MODES)}")
        print(f" Log Level: {cls.app.LOG_LEVEL}")
        print("\nMode Settings:")
        print(" Real-Time:")
        print(f" Voice: {cls.app.REALTIME_VOICE}")
        # Persona is truncated to keep the dump readable.
        print(f" Persona: {cls.app.REALTIME_PERSONA[:50]}...")
        print(f" Streaming: {cls.app.REALTIME_STREAMING}")
        print(" Creative:")
        print(f" LLM: {cls.app.CREATIVE_LLM}")
        print(f" Emotion: {cls.app.CREATIVE_DEFAULT_EMOTION or 'None'}")
        print(" Dubbing:")
        print(f" Chunk Size: {cls.app.DUBBING_CHUNK_SIZE}\n")
# Script entry point: dump the active configuration, then run validation
# (which prints any problems it finds).
if __name__ == "__main__":
    Config.print_config()
    Config.validate()