Skip to content

Commit 0ca50ec

Browse files
committed
support XPU
1 parent 41c5e8c commit 0ca50ec

File tree

5 files changed

+24
-11
lines changed

5 files changed

+24
-11
lines changed

README.md

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,20 +53,27 @@
5353

5454
- Clone the repo
5555
``` sh
56-
git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
56+
git clone --recursive https://github.com/DDXDB/CosyVoice-XPU.git
5757
# If cloning the submodules fails due to network errors, rerun the following commands until they succeed
58-
cd CosyVoice
58+
cd CosyVoice-XPU
5959
git submodule update --init --recursive
6060
```
6161

62+
- Install `Intel® Deep Learning Essentials` or `Intel® oneAPI Base Toolkit`
63+
- Please see: https://pytorch.org/docs/main/notes/get_start_xpu.html
6264
- Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
6365
- Create Conda env:
6466

6567
``` sh
66-
conda create -n cosyvoice -y python=3.10
67-
conda activate cosyvoice
68+
conda create -n cosyvoice-XPU -y python=3.10
69+
conda activate cosyvoice-XPU
6870
# pynini is required by WeTextProcessing; install it with conda, since the conda package works on all platforms.
6971
conda install -y -c conda-forge pynini==2.1.5
72+
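# Initialize the oneAPI environment (Windows cmd; wrap the paths below in double quotes, since they contain spaces), then install the PyTorch XPU build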
73+
call C:\Program Files (x86)\Intel\oneAPI\compiler\2025.0\env\vars.bat
74+
call C:\Program Files (x86)\Intel\oneAPI\ocloc\2025.0\env\vars.bat
75+
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu
76+
7077
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
7178

7279
# If you encounter sox compatibility issues

cosyvoice/cli/cosyvoice.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
4444
if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
4545
load_jit, load_trt, fp16 = False, False, False
4646
logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
47+
if torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
48+
load_jit, load_trt, fp16 = False, False, False
49+
logging.warning('no xpu device, set load_jit/load_trt/fp16 to False')
4750
self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
4851
self.model.load('{}/llm.pt'.format(model_dir),
4952
'{}/flow.pt'.format(model_dir),
@@ -144,6 +147,9 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
144147
if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
145148
load_jit, load_trt, fp16 = False, False, False
146149
logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
150+
if torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
151+
load_jit, load_trt, fp16 = False, False, False
152+
logging.warning('no xpu device, set load_jit/load_trt/fp16 to False')
147153
self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
148154
self.model.load('{}/llm.pt'.format(model_dir),
149155
'{}/flow.pt'.format(model_dir),

cosyvoice/cli/frontend.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,14 @@ def __init__(self,
4545
allowed_special: str = 'all'):
4646
self.tokenizer = get_tokenizer()
4747
self.feat_extractor = feat_extractor
48-
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
48+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
4949
option = onnxruntime.SessionOptions()
5050
option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
5151
option.intra_op_num_threads = 1
5252
self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
5353
self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
5454
providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
55+
"CUDAExecutionProvider" if torch.xpu.is_available() else
5556
"CPUExecutionProvider"])
5657
if os.path.exists(spk2info):
5758
self.spk2info = torch.load(spk2info, map_location=self.device)

cosyvoice/cli/model.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def __init__(self,
3030
flow: torch.nn.Module,
3131
hift: torch.nn.Module,
3232
fp16: bool):
33-
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
33+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
3434
self.llm = llm
3535
self.flow = flow
3636
self.hift = hift
@@ -56,7 +56,7 @@ def __init__(self,
5656
# rtf and decoding related
5757
self.stream_scale_factor = 1
5858
assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
59-
self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
59+
self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
6060
self.lock = threading.Lock()
6161
# dict used to store session related variable
6262
self.tts_speech_token_dict = {}
@@ -275,7 +275,7 @@ def __init__(self,
275275
flow: torch.nn.Module,
276276
hift: torch.nn.Module,
277277
fp16: bool):
278-
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
278+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
279279
self.llm = llm
280280
self.flow = flow
281281
self.hift = hift
@@ -296,7 +296,7 @@ def __init__(self,
296296
self.speech_window = np.hamming(2 * self.source_cache_len)
297297
# rtf and decoding related
298298
self.stream_scale_factor = 1
299-
self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
299+
self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
300300
self.lock = threading.Lock()
301301
# dict used to store session related variable
302302
self.tts_speech_token_dict = {}

requirements.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,7 @@ tensorboard==2.14.0
2828
tensorrt-cu12==10.0.1; sys_platform == 'linux'
2929
tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux'
3030
tensorrt-cu12-libs==10.0.1; sys_platform == 'linux'
31-
torch==2.3.1
32-
torchaudio==2.3.1
31+
3332
transformers==4.40.1
3433
uvicorn==0.30.0
3534
wget==3.2

0 commit comments

Comments
 (0)