support XPU

DDXDB · DDXDB · commit 0ca50ec3b9f5 · 2025-01-17T02:30:57.000+08:00
diff --git a/README.md b/README.md
@@ -53,20 +53,27 @@
 
 - Clone the repo
 ``` sh
-git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
+git clone --recursive https://github.com/DDXDB/CosyVoice-XPU.git
 # If you failed to clone submodule due to network failures, please run following command until success
-cd CosyVoice
+cd CosyVoice-XPU
 git submodule update --init --recursive
 ```
 
+- Install `Intel® Deep Learning Essentials` or `Intel® oneAPI Base Toolkit`
+- For installation instructions, please see: https://pytorch.org/docs/main/notes/get_start_xpu.html
 - Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
 - Create Conda env:
 
 ``` sh
-conda create -n cosyvoice -y python=3.10
-conda activate cosyvoice
+conda create -n cosyvoice-XPU -y python=3.10
+conda activate cosyvoice-XPU
 # pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platform.
 conda install -y -c conda-forge pynini==2.1.5
+# Set up the oneAPI environment (Windows cmd; quote the paths because they contain spaces), then install PyTorch with XPU support
+call "C:\Program Files (x86)\Intel\oneAPI\compiler\2025.0\env\vars.bat"
+call "C:\Program Files (x86)\Intel\oneAPI\ocloc\2025.0\env\vars.bat"
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu
+
 pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
 
 # If you encounter sox compatibility issues
diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
@@ -44,6 +44,9 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
         if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
             logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+            if torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+                load_jit, load_trt, fp16 = False, False, False
+                logging.warning('no xpu device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
@@ -144,6 +147,9 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
         if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
             logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+            if torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+                load_jit, load_trt, fp16 = False, False, False
+                logging.warning('no xpu device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
@@ -45,13 +45,14 @@ def __init__(self,
                  allowed_special: str = 'all'):
         self.tokenizer = get_tokenizer()
         self.feat_extractor = feat_extractor
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         option = onnxruntime.SessionOptions()
         option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
         option.intra_op_num_threads = 1
         self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
         self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
                                                                      providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
+                                                                                "CUDAExecutionProvider" if torch.xpu.is_available() else
                                                                                 "CPUExecutionProvider"])
         if os.path.exists(spk2info):
             self.spk2info = torch.load(spk2info, map_location=self.device)
diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
@@ -30,7 +30,7 @@ def __init__(self,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
                  fp16: bool):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -56,7 +56,7 @@ def __init__(self,
         # rtf and decoding related
         self.stream_scale_factor = 1
         assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
-        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
@@ -275,7 +275,7 @@ def __init__(self,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
                  fp16: bool):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -296,7 +296,7 @@ def __init__(self,
         self.speech_window = np.hamming(2 * self.source_cache_len)
         # rtf and decoding related
         self.stream_scale_factor = 1
-        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
diff --git a/requirements.txt b/requirements.txt
@@ -28,8 +28,7 @@ tensorboard==2.14.0
 tensorrt-cu12==10.0.1; sys_platform == 'linux'
 tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux'
 tensorrt-cu12-libs==10.0.1; sys_platform == 'linux'
-torch==2.3.1
-torchaudio==2.3.1
+
 transformers==4.40.1
 uvicorn==0.30.0
 wget==3.2