diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index cc443bed..ccf4a555 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -18,6 +18,21 @@
 from hyperpyyaml import load_hyperpyyaml
 from modelscope import snapshot_download
 import torch
+# try:
+#     import intel_extension_for_pytorch as ipex
+# except Exception:
+#     pass
+#
+# if torch.xpu.is_available():
+#     from ipex_to_cuda import ipex_init
+#     ipex_active, message = ipex_init()
+#     print(f"IPEX Active: {ipex_active} Message: {message}")
+#
+#
+# if torch.cuda.is_available():
+#     if hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked:
+#         print("IPEX to CUDA is working!")
+
 from cosyvoice.cli.frontend import CosyVoiceFrontEnd
 from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
 from cosyvoice.utils.file_utils import logging
@@ -48,6 +63,6 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_co
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+        if torch.cuda.is_available() is False and torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+            logging.warning('no cuda or xpu device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
@@ -163,6 +178,6 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, load_vllm=False, f
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+        if torch.cuda.is_available() is False and torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+            logging.warning('no cuda or xpu device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
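The guards above clear `load_jit`/`load_trt`/`fp16` only when neither CUDA nor an XPU is present. Two caveats are worth keeping in mind: `torch.xpu` exists only on XPU-enabled PyTorch builds (2.4+, or after importing intel_extension_for_pytorch), so a bare `torch.xpu.is_available()` can raise `AttributeError` on older installs; and TensorRT is CUDA-only, so `load_trt` cannot be honoured on an XPU-only machine in any case. A minimal defensive sketch of that logic (the helpers `has_accelerator` and `sanitize_accel_flags` are hypothetical names, not part of this patch):

```python
import torch


def has_accelerator() -> bool:
    """True if a usable CUDA or XPU device is present."""
    if torch.cuda.is_available():
        return True
    # torch.xpu is absent on builds without XPU support, so guard with hasattr
    return hasattr(torch, "xpu") and torch.xpu.is_available()


def sanitize_accel_flags(load_jit: bool, load_trt: bool, fp16: bool):
    """Clear accelerator-only flags that the current machine cannot honour."""
    if not has_accelerator():
        return False, False, False  # CPU only: no jit/trt/fp16
    if not torch.cuda.is_available():
        load_trt = False  # TensorRT requires CUDA; an XPU cannot use it
    return load_jit, load_trt, fp16
```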
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index f98b0d61..8831a923 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -16,6 +16,21 @@
 import json
 import onnxruntime
 import torch
+# try:
+#     import intel_extension_for_pytorch as ipex
+# except Exception:
+#     pass
+#
+# if torch.xpu.is_available():
+#     from ipex_to_cuda import ipex_init
+#     ipex_active, message = ipex_init()
+#     print(f"IPEX Active: {ipex_active} Message: {message}")
+#
+#
+# if torch.cuda.is_available():
+#     if hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked:
+#         print("IPEX to CUDA is working!")
+
 import numpy as np
 import whisper
 from typing import Callable
@@ -47,13 +62,13 @@ def __init__(self,
                  allowed_special: str = 'all'):
         self.tokenizer = get_tokenizer()
         self.feat_extractor = feat_extractor
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         option = onnxruntime.SessionOptions()
         option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
         option.intra_op_num_threads = 1
         self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
         self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
                                                                      providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
                                                                                 "CPUExecutionProvider"])
         if os.path.exists(spk2info):
             self.spk2info = torch.load(spk2info, map_location=self.device)
diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 9c8ac7e7..cc289ff4 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -15,6 +15,21 @@
 import os
 from typing import Generator
 import torch
+# try:
+#     import intel_extension_for_pytorch as ipex
+# except Exception:
+#     pass
+#
+# if torch.xpu.is_available():
+#     from ipex_to_cuda import ipex_init
+#     ipex_active, message = ipex_init()
+#     print(f"IPEX Active: {ipex_active} Message: {message}")
+#
+#
+# if torch.cuda.is_available():
+#     if hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked:
+#         print("IPEX to CUDA is working!")
+
 import numpy as np
 import threading
 import time
@@ -33,7 +48,7 @@ def __init__(self,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
                  fp16: bool = False):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -55,7 +70,9 @@ def __init__(self,
         # rtf and decoding related
         self.stream_scale_factor = 1
         assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
-        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.llm_context = torch.cuda.stream(
+            torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(
+            torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
@@ -244,7 +261,7 @@ def __init__(self,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
                  fp16: bool = False):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -260,7 +277,7 @@ def __init__(self,
         # speech fade in out
         self.speech_window = np.hamming(2 * self.source_cache_len)
         # rtf and decoding related
-        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
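Two assumptions behind the frontend and model hunks above: stock onnxruntime ships no XPU execution provider (Intel GPU support would come through a separate OpenVINO build), so on an XPU machine the speech-tokenizer session falls through to `CPUExecutionProvider`; and `torch.xpu.Stream`/`torch.xpu.stream` mirror the `torch.cuda` stream API on XPU-enabled builds. The chained conditional expression that picks `llm_context` is easier to audit as a small factory; a sketch under those assumptions (`make_llm_context` is a hypothetical name, not part of this patch):

```python
from contextlib import nullcontext

import torch


def make_llm_context(device: torch.device):
    """Return a side-stream context for LLM decoding, or a no-op on CPU."""
    if torch.cuda.is_available():
        return torch.cuda.stream(torch.cuda.Stream(device))
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        # torch.xpu mirrors the torch.cuda stream API on XPU-enabled builds
        return torch.xpu.stream(torch.xpu.Stream(device))
    return nullcontext()
```

Both `CosyVoiceModel.__init__` and `CosyVoice2Model.__init__` could then assign `self.llm_context = make_llm_context(self.device)` in place of the chained expression.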
diff --git a/webui.py b/webui.py
index 3552cd92..659a2014 100644
--- a/webui.py
+++ b/webui.py
@@ -14,7 +14,6 @@
 import os
 import sys
 import argparse
-import gradio as gr
 import numpy as np
 import torch
 import torchaudio
@@ -25,7 +24,7 @@
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
 from cosyvoice.utils.file_utils import load_wav, logging
 from cosyvoice.utils.common import set_all_random_seed
-
+import gradio as gr
 inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制']
 instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成音频按钮',
                  '3s极速复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮',