23 changes: 19 additions & 4 deletions cosyvoice/cli/cosyvoice.py

@@ -18,6 +18,21 @@
 from hyperpyyaml import load_hyperpyyaml
 from modelscope import snapshot_download
 import torch
+# try:
+#     import intel_extension_for_pytorch as ipex
+# except Exception:
+#     pass
+#
+# if torch.xpu.is_available():
+#     from ipex_to_cuda import ipex_init
+#     ipex_active, message = ipex_init()
+#     print(f"IPEX Active: {ipex_active} Message: {message}")
+#
+#
+# if torch.cuda.is_available():
+#     if hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked:
+#         print("IPEX to CUDA is working!")
+
 from cosyvoice.cli.frontend import CosyVoiceFrontEnd
 from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
 from cosyvoice.utils.file_utils import logging
@@ -48,6 +63,6 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_co
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+        if torch.cuda.is_available() is False and torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+            logging.warning('no cuda or xpu device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
@@ -163,6 +178,6 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, load_vllm=False, f
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+        if torch.cuda.is_available() is False and torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+            logging.warning('no cuda or xpu device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
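Review note: as written, a standalone XPU check after the CUDA check would disable load_jit/load_trt/fp16 on every CUDA-only machine, so the two guards are merged above. A remaining caveat is that `torch.xpu` only exists on PyTorch 2.4+ (or once intel_extension_for_pytorch is imported), so older builds would raise AttributeError before the check runs. A minimal sketch of a guarded helper; the name `accelerator_available` is ours, not part of this PR:

```python
import torch

def accelerator_available() -> bool:
    """True if a CUDA or XPU device can back the jit/trt/fp16 paths."""
    if torch.cuda.is_available():
        return True
    # torch.xpu is missing on older PyTorch builds without IPEX, so probe first.
    xpu = getattr(torch, "xpu", None)
    return xpu is not None and xpu.is_available()
```

Both constructors could then share one guard: `if not accelerator_available() and (load_jit or load_trt or fp16): ...`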
18 changes: 17 additions & 1 deletion cosyvoice/cli/frontend.py

@@ -16,6 +16,21 @@
 import json
 import onnxruntime
 import torch
+# try:
+#     import intel_extension_for_pytorch as ipex
+# except Exception:
+#     pass
+#
+# if torch.xpu.is_available():
+#     from ipex_to_cuda import ipex_init
+#     ipex_active, message = ipex_init()
+#     print(f"IPEX Active: {ipex_active} Message: {message}")
+#
+#
+# if torch.cuda.is_available():
+#     if hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked:
+#         print("IPEX to CUDA is working!")
+
 import numpy as np
 import whisper
 from typing import Callable
@@ -47,13 +62,14 @@ def __init__(self,
                  allowed_special: str = 'all'):
         self.tokenizer = get_tokenizer()
         self.feat_extractor = feat_extractor
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         option = onnxruntime.SessionOptions()
         option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
         option.intra_op_num_threads = 1
         self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
         self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
                                                                      providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
+                                                                                "CUDAExecutionProvider" if torch.xpu.is_available() else
                                                                                 "CPUExecutionProvider"])
         if os.path.exists(spk2info):
             self.spk2info = torch.load(spk2info, map_location=self.device)
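One caveat in the hunk above: on an XPU-only machine it still requests `"CUDAExecutionProvider"`, which ONNX Runtime has no CUDA backend to satisfy on Intel GPUs, so the session silently falls back to CPU. A sketch of selecting from the providers this onnxruntime build actually ships; `OpenVINOExecutionProvider` is an assumption here and requires the separate onnxruntime-openvino package:

```python
import onnxruntime
import torch

def pick_providers() -> list:
    """Choose an execution provider list for InferenceSession."""
    available = onnxruntime.get_available_providers()
    if torch.cuda.is_available() and "CUDAExecutionProvider" in available:
        return ["CUDAExecutionProvider", "CPUExecutionProvider"]
    # Intel GPU path: an assumption, only present in onnxruntime-openvino builds.
    xpu = getattr(torch, "xpu", None)
    if xpu is not None and xpu.is_available() and "OpenVINOExecutionProvider" in available:
        return ["OpenVINOExecutionProvider", "CPUExecutionProvider"]
    return ["CPUExecutionProvider"]
```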
25 changes: 21 additions & 4 deletions cosyvoice/cli/model.py

@@ -15,6 +15,21 @@
 import os
 from typing import Generator
 import torch
+# try:
+#     import intel_extension_for_pytorch as ipex
+# except Exception:
+#     pass
+#
+# if torch.xpu.is_available():
+#     from ipex_to_cuda import ipex_init
+#     ipex_active, message = ipex_init()
+#     print(f"IPEX Active: {ipex_active} Message: {message}")
+#
+#
+# if torch.cuda.is_available():
+#     if hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked:
+#         print("IPEX to CUDA is working!")
+
 import numpy as np
 import threading
 import time
@@ -33,7 +48,7 @@ def __init__(self,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
                  fp16: bool = False):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -55,7 +70,9 @@ def __init__(self,
         # rtf and decoding related
         self.stream_scale_factor = 1
         assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
-        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.llm_context = torch.cuda.stream(
+            torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(
+            torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
@@ -244,7 +261,7 @@ def __init__(self,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
                  fp16: bool = False):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -260,7 +277,7 @@ def __init__(self,
         # speech fade in out
         self.speech_window = np.hamming(2 * self.source_cache_len)
         # rtf and decoding related
-        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
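Review note: the chained conditional for `llm_context` now appears in both `CosyVoiceModel` and `CosyVoice2Model`, so a small factory would keep the fallback order (CUDA stream, then XPU stream, then no-op) in one place. A minimal sketch; `make_llm_context` is our name, and the `torch.xpu.Stream`/`torch.xpu.stream` calls assume a PyTorch build with XPU support (2.4+ or IPEX installed):

```python
from contextlib import nullcontext
import torch

def make_llm_context(device: torch.device):
    """Stream context for the LLM forward pass, or a no-op on CPU."""
    if torch.cuda.is_available():
        return torch.cuda.stream(torch.cuda.Stream(device))
    xpu = getattr(torch, "xpu", None)
    if xpu is not None and xpu.is_available():
        # Mirrors the CUDA branch on Intel GPUs.
        return xpu.stream(xpu.Stream(device))
    return nullcontext()
```

Both constructors could then read `self.llm_context = make_llm_context(self.device)`.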
3 changes: 1 addition & 2 deletions webui.py

@@ -14,7 +14,6 @@
 import os
 import sys
 import argparse
-import gradio as gr
 import numpy as np
 import torch
 import torchaudio
@@ -25,7 +24,7 @@
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
 from cosyvoice.utils.file_utils import load_wav, logging
 from cosyvoice.utils.common import set_all_random_seed
-
+import gradio as gr
 inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制']
 instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成音频按钮',
                  '3s极速复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮',
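Finally, the same commented-out IPEX bootstrap is pasted at the top of all three cli modules. If it is ever enabled, an attribute probe avoids an AttributeError on PyTorch builds that lack `torch.xpu`; `ipex_to_cuda` and its `ipex_init()` come from the comments themselves and are a third-party assumption, not something this PR wires up:

```python
import torch

# Optional Intel GPU bootstrap: a guarded sketch of the commented-out block.
try:
    import intel_extension_for_pytorch as ipex  # noqa: F401
except Exception:
    ipex = None  # plain CUDA/CPU builds land here

# Probe for torch.xpu before touching it: older PyTorch lacks the attribute.
if getattr(torch, "xpu", None) is not None and torch.xpu.is_available():
    from ipex_to_cuda import ipex_init  # third-party helper, per the comments
    ipex_active, message = ipex_init()
    print(f"IPEX Active: {ipex_active} Message: {message}")

# After ipex_init(), torch.cuda is redirected to the XPU; the marker below
# is set by ipex_to_cuda, per the comments in this PR.
if torch.cuda.is_available() and getattr(torch.cuda, "is_xpu_hijacked", False):
    print("IPEX to CUDA is working!")
```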