
Commit 9e15642

Merge pull request #821 from FunAudioLLM/dev/lyuxiang.lx

update

2 parents 6b21f8e + 99ab0f4

11 files changed: +164 −158 lines

README.md

Lines changed: 2 additions & 2 deletions
@@ -132,7 +132,7 @@ import torchaudio
 
 **CosyVoice2 Usage**
 ```python
-cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False)
+cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False)
 
 # NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
 # zero_shot usage
@@ -151,7 +151,7 @@ for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来
 
 **CosyVoice Usage**
 ```python
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True)
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
 # sft usage
 print(cosyvoice.list_available_spks())
 # change stream=True for chunk stream inference
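
For reference, the updated CosyVoice2 entry point can be exercised end to end. A minimal sketch following the README, assuming the load_wav helper from cosyvoice.utils.file_utils and a local zero_shot_prompt.wav; the texts are illustrative:

```python
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav

# new default flags after this commit: no JIT, no TensorRT, fp32 weights
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False)

# zero-shot cloning: synthesize text in the voice of a 16 kHz prompt recording
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)  # assumed local file
for i, j in enumerate(cosyvoice.inference_zero_shot('Hello, nice to meet you.',
                                                    'This is the transcript of the prompt audio.',
                                                    prompt_speech_16k, stream=False)):
    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
```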

cosyvoice/bin/export_jit.py

Lines changed: 35 additions & 18 deletions
@@ -23,7 +23,7 @@
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
-from cosyvoice.cli.cosyvoice import CosyVoice
+from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
 
 
 def get_args():
@@ -37,6 +37,16 @@ def get_args():
     return args
 
 
+def get_optimized_script(model, preserved_attrs=[]):
+    script = torch.jit.script(model)
+    if preserved_attrs != []:
+        script = torch.jit.freeze(script, preserved_attrs=preserved_attrs)
+    else:
+        script = torch.jit.freeze(script)
+    script = torch.jit.optimize_for_inference(script)
+    return script
+
+
 def main():
     args = get_args()
     logging.basicConfig(level=logging.DEBUG,
@@ -46,28 +56,35 @@ def main():
     torch._C._jit_set_profiling_mode(False)
     torch._C._jit_set_profiling_executor(False)
 
-    cosyvoice = CosyVoice(args.model_dir, load_jit=False, load_onnx=False)
+    try:
+        model = CosyVoice(args.model_dir)
+    except Exception:
+        try:
+            model = CosyVoice2(args.model_dir)
+        except Exception:
+            raise TypeError('no valid model_type!')
 
-    # 1. export llm text_encoder
-    llm_text_encoder = cosyvoice.model.llm.text_encoder.half()
-    script = torch.jit.script(llm_text_encoder)
-    script = torch.jit.freeze(script)
-    script = torch.jit.optimize_for_inference(script)
-    script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
+    if not isinstance(model, CosyVoice2):
+        # 1. export llm text_encoder
+        llm_text_encoder = model.model.llm.text_encoder
+        script = get_optimized_script(llm_text_encoder)
+        script.save('{}/llm.text_encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(llm_text_encoder.half())
+        script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
 
-    # 2. export llm llm
-    llm_llm = cosyvoice.model.llm.llm.half()
-    script = torch.jit.script(llm_llm)
-    script = torch.jit.freeze(script, preserved_attrs=['forward_chunk'])
-    script = torch.jit.optimize_for_inference(script)
-    script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
+        # 2. export llm llm
+        llm_llm = model.model.llm.llm
+        script = get_optimized_script(llm_llm, ['forward_chunk'])
+        script.save('{}/llm.llm.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(llm_llm.half(), ['forward_chunk'])
+        script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
 
     # 3. export flow encoder
-    flow_encoder = cosyvoice.model.flow.encoder
-    script = torch.jit.script(flow_encoder)
-    script = torch.jit.freeze(script)
-    script = torch.jit.optimize_for_inference(script)
+    flow_encoder = model.model.flow.encoder
+    script = get_optimized_script(flow_encoder)
     script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+    script = get_optimized_script(flow_encoder.half())
+    script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
 
 
 if __name__ == '__main__':
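
The new get_optimized_script helper bundles the standard TorchScript inference pipeline: script, freeze (optionally preserving extra methods such as forward_chunk), then optimize_for_inference. A minimal self-contained sketch of the same pipeline on a toy module; the module and output file name are illustrative:

```python
import torch

class TinyEncoder(torch.nn.Module):
    """Stand-in for an exported submodule."""
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(80, 80)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(self.proj(x))

model = TinyEncoder().eval()                       # freeze requires eval mode
script = torch.jit.script(model)                   # compile to TorchScript
script = torch.jit.freeze(script)                  # inline weights and attributes
script = torch.jit.optimize_for_inference(script)  # apply inference-only fusions
script.save('tiny_encoder.fp32.zip')
```

Exporting each module twice, once as-is and once after .half(), is what produces the paired fp32/fp16 artifacts that the loader in cosyvoice.py selects between.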

cosyvoice/bin/export_onnx.py

Lines changed: 18 additions & 14 deletions
@@ -27,7 +27,7 @@
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
-from cosyvoice.cli.cosyvoice import CosyVoice
+from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
 
 
 def get_dummy_input(batch_size, seq_len, out_channels, device):
@@ -56,14 +56,20 @@ def main():
     logging.basicConfig(level=logging.DEBUG,
                         format='%(asctime)s %(levelname)s %(message)s')
 
-    cosyvoice = CosyVoice(args.model_dir, load_jit=False, load_onnx=False)
+    try:
+        model = CosyVoice(args.model_dir)
+    except Exception:
+        try:
+            model = CosyVoice2(args.model_dir)
+        except Exception:
+            raise TypeError('no valid model_type!')
 
     # 1. export flow decoder estimator
-    estimator = cosyvoice.model.flow.decoder.estimator
+    estimator = model.model.flow.decoder.estimator
 
-    device = cosyvoice.model.device
-    batch_size, seq_len = 1, 256
-    out_channels = cosyvoice.model.flow.decoder.estimator.out_channels
+    device = model.model.device
+    batch_size, seq_len = 2, 256
+    out_channels = model.model.flow.decoder.estimator.out_channels
     x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
     torch.onnx.export(
         estimator,
@@ -75,13 +81,11 @@ def main():
         input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
         output_names=['estimator_out'],
         dynamic_axes={
-            'x': {0: 'batch_size', 2: 'seq_len'},
-            'mask': {0: 'batch_size', 2: 'seq_len'},
-            'mu': {0: 'batch_size', 2: 'seq_len'},
-            'cond': {0: 'batch_size', 2: 'seq_len'},
-            't': {0: 'batch_size'},
-            'spks': {0: 'batch_size'},
-            'estimator_out': {0: 'batch_size', 2: 'seq_len'},
+            'x': {2: 'seq_len'},
+            'mask': {2: 'seq_len'},
+            'mu': {2: 'seq_len'},
+            'cond': {2: 'seq_len'},
+            'estimator_out': {2: 'seq_len'},
         }
     )
 
@@ -94,7 +98,7 @@ def main():
                                                sess_options=option, providers=providers)
 
     for _ in tqdm(range(10)):
-        x, mask, mu, t, spks, cond = get_dummy_input(random.randint(1, 6), random.randint(16, 512), out_channels, device)
+        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
         output_pytorch = estimator(x, mask, mu, t, spks, cond)
         ort_inputs = {
             'x': x.cpu().numpy(),
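
The batch axis is now pinned to 2 and only seq_len stays dynamic, which matches the fixed-batch shape profiles passed to trtexec below. A minimal sketch of the same export-then-verify pattern on a toy module, assuming onnxruntime is installed; names and shapes are illustrative:

```python
import numpy as np
import onnxruntime
import torch

class TinyEstimator(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2.0

model = TinyEstimator().eval()
x = torch.randn(2, 80, 256)  # batch fixed at 2; only the last axis is dynamic
torch.onnx.export(model, (x,), 'tiny.onnx',
                  input_names=['x'], output_names=['y'],
                  dynamic_axes={'x': {2: 'seq_len'}, 'y': {2: 'seq_len'}})

# parity check against onnxruntime at a different seq_len, same batch size
sess = onnxruntime.InferenceSession('tiny.onnx', providers=['CPUExecutionProvider'])
x2 = torch.randn(2, 80, 512)
np.testing.assert_allclose(model(x2).numpy(),
                           sess.run(None, {'x': x2.numpy()})[0],
                           rtol=1e-4, atol=1e-5)
```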

cosyvoice/bin/export_trt.sh

Lines changed: 1 addition & 0 deletions
@@ -6,4 +6,5 @@ TRT_DIR=<YOUR_TRT_DIR>
 MODEL_DIR=<COSYVOICE2_MODEL_DIR>
 
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRT_DIR/lib:/usr/local/cuda/lib64
+$TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp32.mygpu.plan --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw --outputIOFormats=fp32:chw
 $TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp16.mygpu.plan --fp16 --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw --outputIOFormats=fp16:chw
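
Both trtexec invocations build engines with a fixed batch of 2 and a dynamic sequence axis (4 to 6800 frames). A minimal sketch of inspecting the resulting plan from Python, assuming TensorRT >= 8.5 Python bindings; the plan file name is illustrative:

```python
import tensorrt as trt

logger = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(logger)
with open('flow.decoder.estimator.fp16.mygpu.plan', 'rb') as f:
    engine = runtime.deserialize_cuda_engine(f.read())

# list I/O tensors with their modes and (possibly dynamic, -1) shapes
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    print(name, engine.get_tensor_mode(name), engine.get_tensor_shape(name))
```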

cosyvoice/cli/cosyvoice.py

Lines changed: 19 additions & 23 deletions
@@ -25,35 +25,35 @@
 
 class CosyVoice:
 
-    def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
+    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
+        self.fp16 = fp16
         if not os.path.exists(model_dir):
             model_dir = snapshot_download(model_dir)
         with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
             configs = load_hyperpyyaml(f)
-        assert get_model_type(configs) == CosyVoiceModel, 'do not use {} for CosyVoice initialization!'.format(model_dir)
+        assert get_model_type(configs) != CosyVoice2Model, 'do not use {} for CosyVoice initialization!'.format(model_dir)
         self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
                                           configs['feat_extractor'],
                                           '{}/campplus.onnx'.format(model_dir),
                                           '{}/speech_tokenizer_v1.onnx'.format(model_dir),
                                           '{}/spk2info.pt'.format(model_dir),
                                           configs['allowed_special'])
         self.sample_rate = configs['sample_rate']
-        if torch.cuda.is_available() is False and (fp16 is True or load_jit is True):
-            load_jit = False
-            fp16 = False
-            logging.warning('cpu do not support fp16 and jit, force set to False')
+        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+            load_jit, load_trt, fp16 = False, False, False
+            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
         if load_jit:
-            self.model.load_jit('{}/llm.text_encoder.fp16.zip'.format(model_dir),
-                                '{}/llm.llm.fp16.zip'.format(model_dir),
-                                '{}/flow.encoder.fp32.zip'.format(model_dir))
-        if load_onnx:
-            self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
+            self.model.load_jit('{}/llm.text_encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
+                                '{}/llm.llm.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
+                                '{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
+        if load_trt:
+            self.model.load_trt('{}/flow.decoder.estimator.{}.v100.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
         del configs
 
     def list_available_spks(self):
@@ -123,9 +123,10 @@ def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed
 
 class CosyVoice2(CosyVoice):
 
-    def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
+    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
+        self.fp16 = fp16
         if not os.path.exists(model_dir):
             model_dir = snapshot_download(model_dir)
         with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
@@ -138,22 +139,17 @@ def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
                                           '{}/spk2info.pt'.format(model_dir),
                                           configs['allowed_special'])
         self.sample_rate = configs['sample_rate']
-        if torch.cuda.is_available() is False and load_jit is True:
-            load_jit = False
-            logging.warning('cpu do not support jit, force set to False')
-        self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'])
+        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+            load_jit, load_trt, fp16 = False, False, False
+            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+        self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
         if load_jit:
-            self.model.load_jit('{}/flow.encoder.fp32.zip'.format(model_dir))
-        if load_trt is True and load_onnx is True:
-            load_onnx = False
-            logging.warning('can not set both load_trt and load_onnx to True, force set load_onnx to False')
-        if load_onnx:
-            self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
+            self.model.load_jit('{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
         if load_trt:
-            self.model.load_trt('{}/flow.decoder.estimator.fp16.Volta.plan'.format(model_dir))
+            self.model.load_trt('{}/flow.decoder.estimator.{}.v100.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
         del configs
 
     def inference_instruct(self, *args, **kwargs):
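
Both export scripts now probe the model type by construction: CosyVoice asserts the yaml's model_type is not CosyVoice2Model, so trying the v1 wrapper first and falling back to v2 always lands on the right class. A minimal sketch of that pattern as a reusable helper; load_any_cosyvoice is a hypothetical name, not part of the package:

```python
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2

def load_any_cosyvoice(model_dir, **kwargs):
    # hypothetical helper mirroring the export scripts' fallback:
    # each constructor asserts on the model_type in cosyvoice.yaml
    try:
        return CosyVoice(model_dir, **kwargs)
    except Exception:
        try:
            return CosyVoice2(model_dir, **kwargs)
        except Exception:
            raise TypeError('no valid model_type!')

model = load_any_cosyvoice('pretrained_models/CosyVoice2-0.5B',
                           load_jit=False, load_trt=False, fp16=False)
```

With fp16=True and load_jit or load_trt enabled, the constructors select the matching fp16 artifacts exported above (for example llm.llm.fp16.zip or flow.decoder.estimator.fp16.v100.plan).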
