|
| 1 | +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +from __future__ import print_function |
| 16 | + |
| 17 | +import argparse |
| 18 | +import logging |
| 19 | +logging.getLogger('matplotlib').setLevel(logging.WARNING) |
| 20 | +import os |
| 21 | + |
| 22 | +import torch |
| 23 | +from torch.utils.data import DataLoader |
| 24 | +import torchaudio |
| 25 | +from hyperpyyaml import load_hyperpyyaml |
| 26 | +from tqdm import tqdm |
| 27 | +from cosyvoice.cli.model import CosyVoiceModel |
| 28 | + |
| 29 | +from cosyvoice.dataset.dataset import Dataset |
| 30 | + |
def get_args(argv=None):
    """Parse the command-line options for the inference script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) ``argparse`` reads ``sys.argv[1:]``, which is the
            behavior existing callers rely on.

    Returns:
        argparse.Namespace: The parsed options. The namespace is also printed
        to stdout so the run configuration shows up in the job output.
    """
    parser = argparse.ArgumentParser(description='inference with your model')

    # Required input/output paths.
    parser.add_argument('--config', required=True, help='config file')
    parser.add_argument('--prompt_data', required=True, help='prompt data file')
    parser.add_argument('--prompt_utt2data', required=True, help='prompt utt2data file')
    parser.add_argument('--tts_text', required=True, help='tts input file')
    parser.add_argument('--llm_model', required=True, help='llm model file')
    parser.add_argument('--flow_model', required=True, help='flow model file')
    parser.add_argument('--hifigan_model', required=True, help='hifigan model file')

    # Runtime options.
    parser.add_argument('--gpu',
                        type=int,
                        default=-1,
                        help='gpu id for this rank, -1 for cpu')
    parser.add_argument('--mode',
                        default='sft',
                        choices=['sft', 'zero_shot'],
                        help='inference mode')
    parser.add_argument('--result_dir', required=True,
                        help='directory to save synthesized wavs and wav.scp')

    args = parser.parse_args(argv)
    print(args)
    return args
| 52 | + |
| 53 | + |
def _build_model_input(batch, mode, device):
    """Assemble the keyword arguments for ``model.inference`` from one batch.

    Only the tensors that the selected mode actually feeds to the model are
    moved to ``device``.

    Args:
        batch: Mapping produced by the data loader; values read here must
            support ``.to(device)``.
        mode: ``'sft'`` selects the speaker-embedding input; any other value
            selects the prompt-based (zero-shot) input.
        device: Target ``torch.device`` for the tensors.

    Returns:
        dict: Keyword arguments to splat into ``model.inference``.
    """
    def on_device(key):
        return batch[key].to(device)

    model_input = {'text': on_device('tts_text_token'),
                   'text_len': on_device('tts_text_token_len')}

    if mode == 'sft':
        spk_embedding = on_device('spk_embedding')
        model_input.update({'llm_embedding': spk_embedding,
                            'flow_embedding': spk_embedding})
        return model_input

    # Prompt-based path: the same prompt speech tokens and utterance embedding
    # are passed to both the llm_* and flow_* arguments.
    speech_token = on_device('speech_token')
    speech_token_len = on_device('speech_token_len')
    utt_embedding = on_device('utt_embedding')
    model_input.update({
        'prompt_text': on_device('text_token'),
        'prompt_text_len': on_device('text_token_len'),
        'llm_prompt_speech_token': speech_token,
        'llm_prompt_speech_token_len': speech_token_len,
        'flow_prompt_speech_token': speech_token,
        'flow_prompt_speech_token_len': speech_token_len,
        'prompt_speech_feat': on_device('speech_feat'),
        'prompt_speech_feat_len': on_device('speech_feat_len'),
        'llm_embedding': utt_embedding,
        'flow_embedding': utt_embedding,
    })
    return model_input


def main():
    """Run inference over the test set and write one wav per item plus ``wav.scp``.

    Reads options via ``get_args()``, loads the hyperpyyaml config and the
    three model checkpoints, iterates the inference dataset one utterance at a
    time, saves each synthesized waveform under ``args.result_dir`` and records
    ``<key> <path>`` lines in ``<result_dir>/wav.scp``.

    Raises:
        AssertionError: If a batch carries more than one utterance.
    """
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    # Limit CUDA visibility to the requested id (the CLI documents -1 as CPU).
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    # Init cosyvoice models from configs
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    with open(args.config, 'r') as config_file:
        configs = load_hyperpyyaml(config_file)

    # NOTE(review): the model is never explicitly moved to `device` here;
    # presumably CosyVoiceModel handles its own placement -- confirm.
    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
    model.load(args.llm_model, args.flow_model, args.hifigan_model)

    test_dataset = Dataset(args.prompt_data,
                           data_pipeline=configs['data_pipeline'],
                           mode='inference',
                           shuffle=False,
                           partition=False,
                           tts_file=args.tts_text,
                           prompt_utt2data=args.prompt_utt2data)
    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)

    # The config object is not needed once the model and dataset are built.
    del configs
    os.makedirs(args.result_dir, exist_ok=True)
    fn = os.path.join(args.result_dir, 'wav.scp')
    # The context manager closes wav.scp even if inference or saving raises.
    with open(fn, 'w') as scp_file, torch.no_grad():
        for batch in tqdm(test_data_loader):
            utts = batch["utts"]
            assert len(utts) == 1, "inference mode only support batchsize 1"
            tts_index = batch["tts_index"]

            model_input = _build_model_input(batch, args.mode, device)
            model_output = model.inference(**model_input)

            tts_key = '{}_{}'.format(utts[0], tts_index[0])
            tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
            # NOTE(review): sample rate is hard-coded; presumably it matches
            # the vocoder output rate -- confirm against the config.
            torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050)
            scp_file.write('{} {}\n'.format(tts_key, tts_fn))
            # Flush per item so partial results are on disk during long runs.
            scp_file.flush()
    logging.info('Result wav.scp saved in {}'.format(fn))
| 111 | + |
| 112 | + |
# Script entry point: run inference only when this file is executed directly.
if __name__ == '__main__':
    main()
0 commit comments