23 changes: 19 additions & 4 deletions cosyvoice/cli/cosyvoice.py

@@ -18,6 +18,21 @@
 from hyperpyyaml import load_hyperpyyaml
 from modelscope import snapshot_download
 import torch
+# try:
+#     import intel_extension_for_pytorch as ipex
+# except Exception:
+#     pass
+#
+# if torch.xpu.is_available():
+#     from ipex_to_cuda import ipex_init
+#     ipex_active, message = ipex_init()
+#     print(f"IPEX Active: {ipex_active} Message: {message}")
+#
+#
+# if torch.cuda.is_available():
+#     if hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked:
+#         print("IPEX to CUDA is working!")
+
 from cosyvoice.cli.frontend import CosyVoiceFrontEnd
 from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
 from cosyvoice.utils.file_utils import logging
@@ -48,6 +63,6 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_co
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+        if torch.cuda.is_available() is False and torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+            logging.warning('no cuda or xpu device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
@@ -163,6 +178,6 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, load_vllm=False, f
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+        if torch.cuda.is_available() is False and torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+            logging.warning('no cuda or xpu device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
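Review note: as written, a standalone XPU check after the CUDA check would disable load_jit/load_trt/fp16 on every CUDA-only machine, so the two guards are merged above. A remaining caveat is that `torch.xpu` only exists on PyTorch 2.4+ (or once intel_extension_for_pytorch is imported), so older builds would raise AttributeError before the check runs. A minimal sketch of a guarded helper; the name `accelerator_available` is ours, not part of this PR:

```python
import torch

def accelerator_available() -> bool:
    """True if a CUDA or XPU device can back the jit/trt/fp16 paths."""
    if torch.cuda.is_available():
        return True
    # torch.xpu is missing on older PyTorch builds without IPEX, so probe first.
    xpu = getattr(torch, "xpu", None)
    return xpu is not None and xpu.is_available()
```

Both constructors could then share one guard: `if not accelerator_available() and (load_jit or load_trt or fp16): ...`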
18 changes: 17 additions & 1 deletion cosyvoice/cli/frontend.py

@@ -16,6 +16,21 @@
 import json
 import onnxruntime
 import torch
+# try:
+#     import intel_extension_for_pytorch as ipex
+# except Exception:
+#     pass
+#
+# if torch.xpu.is_available():
+#     from ipex_to_cuda import ipex_init
+#     ipex_active, message = ipex_init()
+#     print(f"IPEX Active: {ipex_active} Message: {message}")
+#
+#
+# if torch.cuda.is_available():
+#     if hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked:
+#         print("IPEX to CUDA is working!")
+
 import numpy as np
 import whisper
 from typing import Callable
@@ -47,13 +62,14 @@ def __init__(self,
                  allowed_special: str = 'all'):
         self.tokenizer = get_tokenizer()
         self.feat_extractor = feat_extractor
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         option = onnxruntime.SessionOptions()
         option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
         option.intra_op_num_threads = 1
         self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
         self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
                                                                      providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
+                                                                                "CUDAExecutionProvider" if torch.xpu.is_available() else
                                                                                 "CPUExecutionProvider"])
         if os.path.exists(spk2info):
             self.spk2info = torch.load(spk2info, map_location=self.device)
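One caveat in the hunk above: on an XPU-only machine it still requests `"CUDAExecutionProvider"`, which ONNX Runtime has no CUDA backend to satisfy on Intel GPUs, so the session silently falls back to CPU. A sketch of selecting from the providers this onnxruntime build actually ships; `OpenVINOExecutionProvider` is an assumption here and requires the separate onnxruntime-openvino package:

```python
import onnxruntime
import torch

def pick_providers() -> list:
    """Choose an execution provider list for InferenceSession."""
    available = onnxruntime.get_available_providers()
    if torch.cuda.is_available() and "CUDAExecutionProvider" in available:
        return ["CUDAExecutionProvider", "CPUExecutionProvider"]
    # Intel GPU path: an assumption, only present in onnxruntime-openvino builds.
    xpu = getattr(torch, "xpu", None)
    if xpu is not None and xpu.is_available() and "OpenVINOExecutionProvider" in available:
        return ["OpenVINOExecutionProvider", "CPUExecutionProvider"]
    return ["CPUExecutionProvider"]
```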
25 changes: 21 additions & 4 deletions cosyvoice/cli/model.py

@@ -15,6 +15,21 @@
 import os
 from typing import Generator
 import torch
+# try:
+#     import intel_extension_for_pytorch as ipex
+# except Exception:
+#     pass
+#
+# if torch.xpu.is_available():
+#     from ipex_to_cuda import ipex_init
+#     ipex_active, message = ipex_init()
+#     print(f"IPEX Active: {ipex_active} Message: {message}")
+#
+#
+# if torch.cuda.is_available():
+#     if hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked:
+#         print("IPEX to CUDA is working!")
+
 import numpy as np
 import threading
 import time
@@ -33,7 +48,7 @@ def __init__(self,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
                  fp16: bool = False):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -55,7 +70,9 @@ def __init__(self,
         # rtf and decoding related
         self.stream_scale_factor = 1
         assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
-        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.llm_context = torch.cuda.stream(
+            torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(
+            torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
@@ -244,7 +261,7 @@ def __init__(self,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
                  fp16: bool = False):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -260,7 +277,7 @@ def __init__(self,
         # speech fade in out
         self.speech_window = np.hamming(2 * self.source_cache_len)
         # rtf and decoding related
-        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
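Review note: the chained conditional for `llm_context` now appears in both `CosyVoiceModel` and `CosyVoice2Model`, so a small factory would keep the fallback order (CUDA stream, then XPU stream, then no-op) in one place. A minimal sketch; `make_llm_context` is our name, and the `torch.xpu.Stream`/`torch.xpu.stream` calls assume a PyTorch build with XPU support (2.4+ or IPEX installed):

```python
from contextlib import nullcontext
import torch

def make_llm_context(device: torch.device):
    """Stream context for the LLM forward pass, or a no-op on CPU."""
    if torch.cuda.is_available():
        return torch.cuda.stream(torch.cuda.Stream(device))
    xpu = getattr(torch, "xpu", None)
    if xpu is not None and xpu.is_available():
        # Mirrors the CUDA branch on Intel GPUs.
        return xpu.stream(xpu.Stream(device))
    return nullcontext()
```

Both constructors could then read `self.llm_context = make_llm_context(self.device)`.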
3 changes: 1 addition & 2 deletions webui.py

@@ -14,7 +14,6 @@
 import os
 import sys
 import argparse
-import gradio as gr
 import numpy as np
 import torch
 import torchaudio
@@ -25,7 +24,7 @@
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
 from cosyvoice.utils.file_utils import load_wav, logging
 from cosyvoice.utils.common import set_all_random_seed
-
+import gradio as gr
 inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制']
 instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成音频按钮',
                  '3s极速复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮',
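Finally, the same commented-out IPEX bootstrap is pasted at the top of all three cli modules. If it is ever enabled, an attribute probe avoids an AttributeError on PyTorch builds that lack `torch.xpu`; `ipex_to_cuda` and its `ipex_init()` come from the comments themselves and are a third-party assumption, not something this PR wires up:

```python
import torch

# Optional Intel GPU bootstrap: a guarded sketch of the commented-out block.
try:
    import intel_extension_for_pytorch as ipex  # noqa: F401
except Exception:
    ipex = None  # plain CUDA/CPU builds land here

# Probe for torch.xpu before touching it: older PyTorch lacks the attribute.
if getattr(torch, "xpu", None) is not None and torch.xpu.is_available():
    from ipex_to_cuda import ipex_init  # third-party helper, per the comments
    ipex_active, message = ipex_init()
    print(f"IPEX Active: {ipex_active} Message: {message}")

# After ipex_init(), torch.cuda is redirected to the XPU; the marker below
# is set by ipex_to_cuda, per the comments in this PR.
if torch.cuda.is_available() and getattr(torch.cuda, "is_xpu_hijacked", False):
    print("IPEX to CUDA is working!")
```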