forked from QwenLM/Qwen3-TTS
-
Notifications
You must be signed in to change notification settings - Fork 50
Expand file tree
/
Copy pathconfig.yaml
More file actions
87 lines (75 loc) · 2.88 KB
/
config.yaml
File metadata and controls
87 lines (75 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Qwen3-TTS server configuration
# Copy this file to ~/qwen3-tts/config.yaml (or point TTS_CONFIG to its location).
# The optimized backend reads this file at startup.
---
# Model to load on first request (must match a key in the `models` section below).
# Quoted defensively: the name starts with a digit sequence ("0.6...").
default_model: "0.6B-CustomVoice"
# Available models. Each entry maps a short name to a HuggingFace repo id (or
# a local path) and a model type ("customvoice" or "base").
#
# - customvoice — predefined speakers (Vivian, Ryan, …); fastest for bulk TTS
# - base — required for voice cloning (clone: prefix) and voice library
#
# Keys are quoted because they begin with number-looking text ("0.6", "1.7").
models:
  "0.6B-CustomVoice":
    hf_id: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice
    type: customvoice
  "0.6B-Base":
    hf_id: Qwen/Qwen3-TTS-12Hz-0.6B-Base
    type: base
  "1.7B-CustomVoice":
    hf_id: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
    type: customvoice
  "1.7B-Base":
    hf_id: Qwen/Qwen3-TTS-12Hz-1.7B-Base
    type: base
# Performance optimizations (only used with TTS_BACKEND=optimized)
optimization:
  # Attention implementation. Options: flash_attention_2 | sdpa | eager
  # flash_attention_2 is fastest on modern GPUs; falls back to sdpa automatically.
  attention: flash_attention_2
  # torch.compile mode. Options: default | reduce-overhead | max-autotune
  # max-autotune gives the best throughput after a ~75 s compilation on first start.
  compile_mode: max-autotune
  # Whether to enable torch.compile at all. Set to false to skip compilation
  # (faster startup, lower throughput).
  use_compile: true
  # Whether to capture CUDA graphs (works on both CUDA and ROCm).
  # Required for max-autotune to reach peak performance.
  use_cuda_graphs: false  # set true after verifying compile works on your GPU
  # Fast codebook generation (bypasses HuggingFace generate() overhead).
  use_fast_codebook: true
  # Compile the codebook predictor as well (default true, recommended).
  compile_codebook_predictor: true
# Streaming-specific parameters
streaming:
  # Window size (frames) for the sliding streaming decoder.
  # AMD ROCm: use 72 (values 66/67/71 trigger a CUDA graph capture bug).
  # NVIDIA CUDA: 64, 72, or 80 all work.
  decode_window_frames: 80
  # Emit a PCM chunk every N decoded frames. Lower = lower TTFB but slightly
  # higher RTF cost. Recommended: 6 (good TTFB/RTF balance for voice agents).
  emit_every_frames: 6
# Server listen address and port
server:
  host: "0.0.0.0"  # binds all interfaces; use "127.0.0.1" to restrict to localhost
  port: 8880
# Built-in voice definitions (used by the /v1/voices endpoint when the optimized
# backend is active). You can add/remove entries here freely.
#
# NOTE(review): language labels for the last three voices were shifted by one in
# the previous revision (Aiden=Japanese, Ono_Anna=Korean, Sohee=English). Per the
# Qwen3-TTS speaker list, Aiden is English (American), Ono_Anna is Japanese, and
# Sohee is Korean — corrected below; confirm against the model card.
voices:
  - name: Vivian
    language: Chinese
  - name: Serena
    language: Chinese
  - name: Uncle_Fu
    language: Chinese
  - name: Dylan
    language: Chinese
  - name: Eric
    language: Chinese
  - name: Ryan
    language: English
  - name: Aiden
    language: English
  - name: Ono_Anna
    language: Japanese
  - name: Sohee
    language: Korean