feat: support half-precision mode in SparkTTS

zhzLuke96 · zhzLuke96 · commit 39d857ef0bb3 · 2025-07-05T02:37:46.000+08:00
- SparkTTS 支持半精度，但是只允许 bf16 形式运行
- 增加 bf16 启动项
diff --git a/modules/core/models/tts/SparkTTS/SparkTTS.py b/modules/core/models/tts/SparkTTS/SparkTTS.py
@@ -19,7 +19,7 @@
 
 import numpy.typing as npt
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2ForCausalLM
 
 from modules.repos_static.spark_tts.sparktts.models.audio_tokenizer import (
     BiCodecTokenizer,
@@ -37,7 +37,12 @@ class SparkTTS:
     Spark-TTS for text-to-speech generation.
     """
 
-    def __init__(self, model_dir: Path, device: torch.device = torch.device("cuda:0")):
+    def __init__(
+        self,
+        model_dir: Path,
+        device: torch.device = torch.device("cuda:0"),
+        dtype: torch.dtype = torch.float32,
+    ):
         """
         Initializes the SparkTTS model with the provided configurations and device.
 
@@ -46,6 +51,7 @@ def __init__(self, model_dir: Path, device: torch.device = torch.device("cuda:0"
             device (torch.device): The device (CPU/GPU) to run the model on.
         """
         self.device = device
+        self.dtype = dtype
         self.model_dir = model_dir
         self.configs = load_config(f"{model_dir}/config.yaml")
         self.sample_rate = self.configs["sample_rate"]
@@ -54,9 +60,12 @@ def __init__(self, model_dir: Path, device: torch.device = torch.device("cuda:0"
     def _initialize_inference(self):
         """Initializes the tokenizer, model, and audio tokenizer for inference."""
         self.tokenizer = AutoTokenizer.from_pretrained(f"{self.model_dir}/LLM")
-        self.model = AutoModelForCausalLM.from_pretrained(f"{self.model_dir}/LLM")
+        self.model: Qwen2ForCausalLM = AutoModelForCausalLM.from_pretrained(
+            f"{self.model_dir}/LLM"
+        )
         self.audio_tokenizer = BiCodecTokenizer(self.model_dir, device=self.device)
-        self.model.to(self.device)
+        self.model.to(device=self.device, dtype=self.dtype)
+        self.model.eval()
 
     def process_prompt(
         self,
diff --git a/modules/core/models/tts/SparkTTS/SparkTTSModel.py b/modules/core/models/tts/SparkTTS/SparkTTSModel.py
@@ -5,6 +5,7 @@
 from typing import Literal, Optional, Union
 
 import soundfile as sf
+import torch
 
 from modules.core.models.tts.SparkTTS.SparkTTS import SparkTTS
 from modules.core.models.TTSModel import TTSModel
@@ -38,11 +39,25 @@ def check_files(self) -> None:
     def is_downloaded(self) -> bool:
         return self.model_path.exists()
 
+    def get_dtype(self):
+        dtype = super().get_dtype()
+        if dtype == torch.float16:
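+            # NOTE: SparkTTS behaves very badly in float16; it practically breaks the model and makes it unusable.
+            # NOTE: The `--bf16` launch flag enables bfloat16 mode, which does run but still tends to generate a lot of blank (silent) output.
+            # NOTE: So when half precision is enabled without bf16, fall back to float32.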
+            logger.warning(
+                "检测到 dtype 为 float16，但 SparkTTS 对 float16 支持很差，已强制切换为 float32。"
+                "建议使用 --bf16 开启 bfloat16 模式以获得更好兼容性。"
+            )
+            return torch.float32
+        return dtype
+
     def load(self):
         if self.model is None:
-            # TODO: 配置 dtype
             self.model = SparkTTS(
-                model_dir=str(self.model_path), device=self.get_device()
+                model_dir=str(self.model_path),
+                device=self.get_device(),
+                dtype=self.get_dtype(),
             )
         return self.model
 
diff --git a/modules/devices/devices.py b/modules/devices/devices.py
@@ -145,8 +145,12 @@ def reset_device():
         config.runtime_env_vars.no_half = True
 
     if not config.runtime_env_vars.no_half:
-        dtype = torch.float16
-        logger.info("Using half precision: torch.float16")
+        if config.runtime_env_vars.bf16:
+            dtype = torch.bfloat16
+            logger.info("Using half precision: torch.bfloat16")
+        else:
+            dtype = torch.float16
+            logger.info("Using half precision: torch.float16")
     else:
         dtype = torch.float32
         logger.info("Using full precision: torch.float32")
diff --git a/modules/models_setup.py b/modules/models_setup.py
@@ -56,11 +56,18 @@ def setup_model_args(parser: argparse.ArgumentParser):
         action="store_true",
         help="Preload all models at startup",
     )
+    # NOTE: Enabling ftc effectively warms up torch, at the cost of a slower service cold start.
     parser.add_argument(
         "--ftc",
         action="store_true",
         help="Enable first time calculation",
     )
+    # NOTE: Models differ in what they tolerate; SparkTTS, for example, works with bfloat16 but not float16, so some models need this flag when running in half precision.
+    parser.add_argument(
+        "--bf16",
+        action="store_true",
+        help="Use bfloat16 as the data type when loading with half precision.",
+    )
 
 
 def process_model_args(args: argparse.Namespace):
@@ -75,6 +82,7 @@ def process_model_args(args: argparse.Namespace):
     debug_generate = env.get_and_update_env(args, "debug_generate", False, bool)
     preload_models = env.get_and_update_env(args, "preload_models", False, bool)
     enable_ftc = env.get_and_update_env(args, "ftc", False, bool)
+    bf16 = env.get_and_update_env(args, "bf16", False, bool)
 
     # TODO: 需要等 zoo 模块实现
     # generate_audio.setup_lru_cache()