forked from QwenLM/Qwen3-TTS
-
Notifications
You must be signed in to change notification settings - Fork 50
Expand file tree
/
Copy pathconfig.yaml
More file actions
87 lines (75 loc) · 2.88 KB
/
config.yaml
File metadata and controls
87 lines (75 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Qwen3-TTS server configuration
# Copy this file to ~/qwen3-tts/config.yaml (or point TTS_CONFIG to its location).
# The optimized backend reads this file at startup.
---
# Model to load on first request (must match a key in the `models` section below).
# Quoted defensively: the name starts with a digit sequence ("0.6...").
default_model: "0.6B-CustomVoice"
# Available models. Each entry maps a short name to a HuggingFace repo id (or
# a local path) and a model type ("customvoice" or "base").
#
# - customvoice — predefined speakers (Vivian, Ryan, …); fastest for bulk TTS
# - base — required for voice cloning (clone: prefix) and voice library
#
# Keys are quoted because they begin with number-looking text ("0.6", "1.7").
models:
  "0.6B-CustomVoice":
    hf_id: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice
    type: customvoice
  "0.6B-Base":
    hf_id: Qwen/Qwen3-TTS-12Hz-0.6B-Base
    type: base
  "1.7B-CustomVoice":
    hf_id: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
    type: customvoice
  "1.7B-Base":
    hf_id: Qwen/Qwen3-TTS-12Hz-1.7B-Base
    type: base
# Performance optimizations (only used with TTS_BACKEND=optimized)
optimization:
  # Attention implementation. Options: flash_attention_2 | sdpa | eager
  # flash_attention_2 is fastest on modern GPUs; falls back to sdpa automatically.
  attention: flash_attention_2
  # torch.compile mode. Options: default | reduce-overhead | max-autotune
  # max-autotune gives the best throughput after a ~75 s compilation on first start.
  compile_mode: max-autotune
  # Whether to enable torch.compile at all. Set to false to skip compilation
  # (faster startup, lower throughput).
  use_compile: true
  # Whether to capture CUDA graphs (works on both CUDA and ROCm).
  # Required for max-autotune to reach peak performance.
  use_cuda_graphs: false  # set true after verifying compile works on your GPU
  # Fast codebook generation (bypasses HuggingFace generate() overhead).
  use_fast_codebook: true
  # Compile the codebook predictor as well (default true, recommended).
  compile_codebook_predictor: true
# Streaming-specific parameters
streaming:
  # Window size (frames) for the sliding streaming decoder.
  # AMD ROCm: use 72 (values 66/67/71 trigger a CUDA graph capture bug).
  # NVIDIA CUDA: 64, 72, or 80 all work.
  decode_window_frames: 80
  # Emit a PCM chunk every N decoded frames. Lower = lower TTFB but slightly
  # higher RTF cost. Recommended: 6 (good TTFB/RTF balance for voice agents).
  emit_every_frames: 6
# Server listen address and port
server:
  host: "0.0.0.0"  # binds all interfaces; use "127.0.0.1" to restrict to localhost
  port: 8880
# Built-in voice definitions (used by the /v1/voices endpoint when the optimized
# backend is active). You can add/remove entries here freely.
#
# NOTE(review): language labels for the last three voices were shifted by one in
# the previous revision (Aiden=Japanese, Ono_Anna=Korean, Sohee=English). Per the
# Qwen3-TTS speaker list, Aiden is English (American), Ono_Anna is Japanese, and
# Sohee is Korean — corrected below; confirm against the model card.
voices:
  - name: Vivian
    language: Chinese
  - name: Serena
    language: Chinese
  - name: Uncle_Fu
    language: Chinese
  - name: Dylan
    language: Chinese
  - name: Eric
    language: Chinese
  - name: Ryan
    language: English
  - name: Aiden
    language: English
  - name: Ono_Anna
    language: Japanese
  - name: Sohee
    language: Korean