# ---- file: Linear_test.py (new file in this patch) ----
# Ad-hoc parity check: run one 896x896 Linear layer in torch and in paddle on
# the same saved activations and compare the two outputs.
import numpy
import paddle
import torch

torch_linear = torch.load("q.pt").cpu()
paddle_linear_state = paddle.load("q.pdparams")
paddle_linear = paddle.nn.Linear(896, 896, bias_attr=True)
hidden_states = paddle.load("hidden_states.pdparams")
paddle_linear.set_state_dict(paddle_linear_state)
torch_forward_res = torch_linear(torch.tensor(hidden_states.numpy()))
paddle_forward_res = paddle_linear(hidden_states)
print("torch_forward_res:", torch_forward_res)
print("paddle_forward_res:", paddle_forward_res)
# assert_allclose returns None and raises AssertionError on mismatch, so
# printing its return value only ever showed "None". Run the check first and
# report success explicitly; a mismatch still aborts with numpy's own report.
numpy.testing.assert_allclose(
    torch_forward_res.detach().numpy(),
    paddle_forward_res.numpy(),
)
print('allclose_res:', True)

# ---- file: hidden_states.pdparams (binary fixture added by this patch; contents not shown) ----

# ---- file: paddlespeech/cli/tts/cosyvoice.py (new file in this patch) ----
# Scratch driver: builds the Qwen2-based LM, loads converted paddle weights
# and a saved set of inference inputs, then prints the text input.
import os

from paddlespeech.t2s.models.CosyVoice.cosyvoice import CosyVoice2
import sys
from paddlenlp.transformers import AutoTokenizer, AutoModelForCausalLM
from pathlib import Path
import paddle
import torch
from paddlespeech.t2s.models.CosyVoice.llm import Qwen2LM, ras_sampling, Qwen2Encoder

# cosyvoice_model = CosyVoice2("../CosyVoice/pretrained_models/CosyVoice2-0.5B_paddle")
model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2-0.5B')
llm = Qwen2Encoder(model)
qwen_lm = Qwen2LM(896, 896, 6561, llm, ras_sampling)
# The default is the path the original author used; it only exists on that
# machine, so allow overriding it without editing the file.
llm_params_path = os.environ.get(
    "COSYVOICE_LLM_PDPARAMS",
    "/root/paddlejob/workspace/zhangjinghong/CosyVoice/pretrained_models/CosyVoice2-0.5B_paddle/llm.pdparams",
)
state_dict = paddle.load(llm_params_path)
qwen_lm.set_state_dict(state_dict)
# NOTE(review): data.pt is read with torch.load (pickle-based); only use it on
# trusted files.
new_dict = torch.load("data.pt")
text = new_dict['text']
text_len = new_dict['text_len']
prompt_text = new_dict['prompt_text']
prompt_text_len = new_dict['prompt_text_len']
prompt_speech_token = new_dict['prompt_speech_token']
prompt_speech_token_len = new_dict['prompt_speech_token_len']
embedding = new_dict['embedding']
uuid = new_dict['uuid']
print("text:", text)
# for i in qwen_lm.inference(text=paddle.to_tensor(text),
#                            text_len=text_len,
#                            prompt_text=paddle.to_tensor(prompt_text),
#                            prompt_text_len=prompt_text_len,
#                            prompt_speech_token=paddle.to_tensor(prompt_speech_token),
#                            prompt_speech_token_len=prompt_speech_token_len,
#                            embedding=paddle.to_tensor(embedding,dtype = 'float32'),
#                            uuid=uuid):
#     print(text)
#     print(i)

# ---- file: paddlespeech/t2s/models/CosyVoice/__init__.py (new file in this patch) ----
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ---- file: paddlespeech/t2s/models/CosyVoice/__init__.py (last line of that file) ----
from .cosyvoice import *

# ---- file: paddlespeech/t2s/models/CosyVoice/cosyvoice.py (new file in this patch) ----
import os
import time
from typing import Generator

import paddle
from hyperpyyaml import load_hyperpyyaml
from modelscope import snapshot_download
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s')
from paddlespeech.t2s.models.CosyVoice.frontend import CosyVoiceFrontEnd
from paddlespeech.t2s.models.CosyVoice.model import CosyVoice2Model

# CosyVoice (v1) needs CosyVoiceModel, which the original file used without
# importing. Import it defensively so that a port which only ships
# CosyVoice2Model can still import this module and use CosyVoice2.
# NOTE(review): presumably defined next to CosyVoice2Model -- confirm.
try:
    from paddlespeech.t2s.models.CosyVoice.model import CosyVoiceModel
except ImportError:
    CosyVoiceModel = None


def get_model_type(configs):
    """Return the model class matching the component types found in *configs*.

    Raises:
        TypeError: if the llm/flow/hift combination matches neither model.

    NOTE(review): TransformerLM, MaskedDiffWithXvec, CausalMaskedDiffWithXvec,
    Qwen2LM and HiFTGenerator are not imported in this file, so calling this
    function raises NameError as written. Its only call sites (the asserts in
    the two __init__ methods) are commented out.
    """
    # NOTE CosyVoice2Model inherits CosyVoiceModel
    if isinstance(configs['llm'], TransformerLM) and isinstance(configs['flow'], MaskedDiffWithXvec) and isinstance(configs['hift'], HiFTGenerator):
        return CosyVoiceModel
    if isinstance(configs['llm'], Qwen2LM) and isinstance(configs['flow'], CausalMaskedDiffWithXvec) and isinstance(configs['hift'], HiFTGenerator):
        return CosyVoice2Model
    raise TypeError('No valid model type found!')


class CosyVoice:
    """Inference wrapper around the CosyVoice (v1) frontend and model.

    Every ``inference_*`` method is a generator that yields the dictionaries
    produced by ``self.model.tts``; each one is expected to carry a
    ``"tts_speech"`` entry whose second dimension is the sample count.
    """

    def __init__(
        self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_concurrent=1
    ):
        """Load configuration, frontend and model weights from *model_dir*.

        Args:
            model_dir: local directory, or an id passed to
                ``snapshot_download`` when no such directory exists.
            load_jit: also load the exported jit modules.
            load_trt: also load the TensorRT flow-decoder estimator.
            fp16: run the model in half precision.
            trt_concurrent: forwarded to ``model.load_trt``.

        Raises:
            ImportError: if ``CosyVoiceModel`` could not be imported.
            ValueError: if ``cosyvoice.yaml`` is missing from the directory.
        """
        if CosyVoiceModel is None:
            raise ImportError(
                "CosyVoiceModel is not available from "
                "paddlespeech.t2s.models.CosyVoice.model"
            )
        self.instruct = "-Instruct" in model_dir
        self.model_dir = model_dir
        self.fp16 = fp16
        if not os.path.exists(model_dir):
            model_dir = snapshot_download(model_dir)
            # Remember the resolved directory: save_spkinfo() writes below
            # self.model_dir and the bare hub id is not a directory.
            self.model_dir = model_dir
        hyper_yaml_path = "{}/cosyvoice.yaml".format(model_dir)
        if not os.path.exists(hyper_yaml_path):
            raise ValueError("{} not found!".format(hyper_yaml_path))
        with open(hyper_yaml_path, "r") as f:
            configs = load_hyperpyyaml(f)
        # assert (
        #     get_model_type(configs) != CosyVoice2Model
        # ), "do not use {} for CosyVoice initialization!".format(model_dir)
        self.frontend = CosyVoiceFrontEnd(
            configs["get_tokenizer"],
            configs["feat_extractor"],
            "{}/campplus.onnx".format(model_dir),
            "{}/speech_tokenizer_v1.onnx".format(model_dir),
            "{}/spk2info.pt".format(model_dir),
            configs["allowed_special"],
        )
        self.sample_rate = configs["sample_rate"]
        if paddle.device.cuda.device_count() < 1 and (
            load_jit is True or load_trt is True or fp16 is True
        ):
            load_jit, load_trt, fp16 = False, False, False
            # Keep the attribute in sync with what the model actually gets.
            self.fp16 = False
            logging.warning("no cuda device, set load_jit/load_trt/fp16 to False")
        self.model = CosyVoiceModel(
            configs["llm"], configs["flow"], configs["hift"], fp16
        )
        self.model.load(
            "{}/llm.pt".format(model_dir),
            "{}/flow.pt".format(model_dir),
            "{}/hift.pt".format(model_dir),
        )
        precision = "fp16" if self.fp16 is True else "fp32"
        if load_jit:
            self.model.load_jit(
                "{}/llm.text_encoder.{}.zip".format(model_dir, precision),
                "{}/llm.llm.{}.zip".format(model_dir, precision),
                "{}/flow.encoder.{}.zip".format(model_dir, precision),
            )
        if load_trt:
            self.model.load_trt(
                "{}/flow.decoder.estimator.{}.mygpu.plan".format(
                    model_dir, precision
                ),
                "{}/flow.decoder.estimator.fp32.onnx".format(model_dir),
                trt_concurrent,
                self.fp16,
            )
        del configs

    def list_available_spks(self):
        """Return the speaker ids currently stored in the frontend."""
        return list(self.frontend.spk2info.keys())

    def add_zero_shot_spk(self, prompt_text, prompt_speech_16k, zero_shot_spk_id):
        """Cache the prompt features of a speaker under *zero_shot_spk_id*.

        The text entries of the frontend output are dropped so that only the
        prompt-related inputs are stored. Always returns True.
        """
        assert zero_shot_spk_id != "", "do not use empty zero_shot_spk_id"
        model_input = self.frontend.frontend_zero_shot(
            "", prompt_text, prompt_speech_16k, self.sample_rate, ""
        )
        del model_input["text"]
        del model_input["text_len"]
        self.frontend.spk2info[zero_shot_spk_id] = model_input
        return True

    def save_spkinfo(self):
        """Persist the speaker table to ``<model_dir>/spk2info.pt``."""
        paddle.save(
            obj=self.frontend.spk2info, path="{}/spk2info.pt".format(self.model_dir)
        )

    def _timed_tts(self, model_input, stream, speed):
        """Yield ``self.model.tts`` outputs and log a real-time factor for each.

        The clock restarts after every yield, so the time the consumer spends
        handling a chunk is not counted against the next one.
        """
        start_time = time.time()
        for model_output in self.model.tts(
            **model_input, stream=stream, speed=speed
        ):
            speech_len = model_output["tts_speech"].shape[1] / self.sample_rate
            elapsed = time.time() - start_time
            # An empty chunk must not abort synthesis with ZeroDivisionError.
            rtf = elapsed / speech_len if speech_len > 0 else float("inf")
            logging.info("yield speech len {}, rtf {}".format(speech_len, rtf))
            yield model_output
            start_time = time.time()

    def inference_sft(
        self, tts_text, spk_id, stream=False, speed=1.0, text_frontend=True
    ):
        """Synthesize *tts_text* with the stored speaker *spk_id*."""
        # The original wrapped this iterable in tqdm(), which is never
        # imported in this file; iterate directly instead.
        for i in self.frontend.text_normalize(
            tts_text, split=True, text_frontend=text_frontend
        ):
            model_input = self.frontend.frontend_sft(i, spk_id)
            logging.info("synthesis text {}".format(i))
            yield from self._timed_tts(model_input, stream, speed)

    def inference_zero_shot(
        self,
        tts_text,
        prompt_text,
        prompt_speech_16k,
        zero_shot_spk_id="",
        stream=False,
        speed=1.0,
        text_frontend=True,
    ):
        """Synthesize *tts_text* in the voice of the given prompt.

        A warning is logged when a text segment is shorter than half of the
        normalized prompt text.
        """
        prompt_text = self.frontend.text_normalize(
            prompt_text, split=False, text_frontend=text_frontend
        )
        for i in self.frontend.text_normalize(
            tts_text, split=True, text_frontend=text_frontend
        ):
            if not isinstance(i, Generator) and len(i) < 0.5 * len(prompt_text):
                logging.warning(
                    "synthesis text {} too short than prompt text {}, this may lead to bad performance".format(
                        i, prompt_text
                    )
                )
            model_input = self.frontend.frontend_zero_shot(
                i, prompt_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id
            )
            logging.info("synthesis text {}".format(i))
            yield from self._timed_tts(model_input, stream, speed)

    def inference_cross_lingual(
        self,
        tts_text,
        prompt_speech_16k,
        zero_shot_spk_id="",
        stream=False,
        speed=1.0,
        text_frontend=True,
    ):
        """Synthesize *tts_text* from a speech prompt without a prompt text."""
        for i in self.frontend.text_normalize(
            tts_text, split=True, text_frontend=text_frontend
        ):
            model_input = self.frontend.frontend_cross_lingual(
                i, prompt_speech_16k, self.sample_rate, zero_shot_spk_id
            )
            logging.info("synthesis text {}".format(i))
            yield from self._timed_tts(model_input, stream, speed)

    def inference_instruct(
        self,
        tts_text,
        spk_id,
        instruct_text,
        stream=False,
        speed=1.0,
        text_frontend=True,
    ):
        """Synthesize *tts_text* with speaker *spk_id* guided by *instruct_text*.

        Raises:
            ValueError: if *model_dir* did not contain ``-Instruct``.
        """
        assert CosyVoiceModel is not None and isinstance(
            self.model, CosyVoiceModel
        ), "inference_instruct is only implemented for CosyVoice!"
        if self.instruct is False:
            raise ValueError(
                "{} do not support instruct inference".format(self.model_dir)
            )
        instruct_text = self.frontend.text_normalize(
            instruct_text, split=False, text_frontend=text_frontend
        )
        for i in self.frontend.text_normalize(
            tts_text, split=True, text_frontend=text_frontend
        ):
            model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
            logging.info("synthesis text {}".format(i))
            yield from self._timed_tts(model_input, stream, speed)

    def inference_vc(
        self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0
    ):
        """Convert *source_speech_16k* to the voice of *prompt_speech_16k*."""
        model_input = self.frontend.frontend_vc(
            source_speech_16k, prompt_speech_16k, self.sample_rate
        )
        yield from self._timed_tts(model_input, stream, speed)
# ---- file: paddlespeech/t2s/models/CosyVoice/cosyvoice.py (continued) ----
class CosyVoice2(CosyVoice):
    """Inference wrapper for CosyVoice2 checkpoints.

    Reuses the ``inference_*`` generators of ``CosyVoice`` except for
    ``inference_instruct``, which is replaced by ``inference_instruct2``.
    """

    def __init__(
        self,
        model_dir,
        load_jit=False,
        load_trt=False,
        load_vllm=False,
        fp16=False,
        trt_concurrent=1,
    ):
        """Load configuration, frontend and model weights from *model_dir*.

        Unlike ``CosyVoice.__init__`` this does not download anything:
        *model_dir* must already contain ``cosyvoice2.yaml``.

        Raises:
            ValueError: if ``cosyvoice2.yaml`` is missing.
        """
        self.instruct = "-Instruct" in model_dir
        self.model_dir = model_dir
        self.fp16 = fp16
        hyper_yaml_path = "{}/cosyvoice2.yaml".format(model_dir)
        if not os.path.exists(hyper_yaml_path):
            raise ValueError("{} not found!".format(hyper_yaml_path))
        with open(hyper_yaml_path, "r") as f:
            configs = load_hyperpyyaml(
                f,
                overrides={
                    "qwen_pretrain_path": os.path.join(model_dir, "CosyVoice-BlankEN")
                },
            )
        # assert (
        #     get_model_type(configs) == CosyVoice2Model
        # ), "do not use {} for CosyVoice2 initialization!".format(model_dir)
        self.frontend = CosyVoiceFrontEnd(
            configs["get_tokenizer"],
            configs["feat_extractor"],
            "{}/campplus.onnx".format(model_dir),
            "{}/speech_tokenizer_v2.onnx".format(model_dir),
            "{}/spk2info.pt".format(model_dir),
            configs["allowed_special"],
        )
        self.sample_rate = configs["sample_rate"]
        if paddle.device.cuda.device_count() < 1 and (
            load_jit is True or load_trt is True or fp16 is True
        ):
            load_jit, load_trt, fp16 = False, False, False
            # Keep the attribute in sync with what the model actually gets.
            self.fp16 = False
            logging.warning("no cuda device, set load_jit/load_trt/fp16 to False")
        self.model = CosyVoice2Model(
            configs["llm"], configs["flow"], configs["hift"], fp16
        )
        self.model.load(
            "{}/llm.pt".format(model_dir),
            "{}/flow.pt".format(model_dir),
            "{}/hift.pt".format(model_dir),
        )
        precision = "fp16" if self.fp16 is True else "fp32"
        if load_vllm:
            self.model.load_vllm("{}/vllm".format(model_dir))
        if load_jit:
            self.model.load_jit(
                "{}/flow.encoder.{}.zip".format(model_dir, precision)
            )
        if load_trt:
            self.model.load_trt(
                "{}/flow.decoder.estimator.{}.mygpu.plan".format(
                    model_dir, precision
                ),
                "{}/flow.decoder.estimator.fp32.onnx".format(model_dir),
                trt_concurrent,
                self.fp16,
            )
        del configs

    def inference_instruct(self, *args, **kwargs):
        """Always raise: the v1 instruct mode does not exist for CosyVoice2."""
        raise NotImplementedError(
            "inference_instruct is not implemented for CosyVoice2!"
        )

    def inference_instruct2(
        self,
        tts_text,
        instruct_text,
        prompt_speech_16k,
        zero_shot_spk_id="",
        stream=False,
        speed=1.0,
        text_frontend=True,
    ):
        """Synthesize *tts_text* guided by *instruct_text* and a speech prompt.

        Yields the dictionaries produced by ``self.model.tts`` and logs a
        real-time factor for every chunk.
        """
        assert isinstance(
            self.model, CosyVoice2Model
        ), "inference_instruct2 is only implemented for CosyVoice2!"
        # The original wrapped this iterable in tqdm(), which is never
        # imported in this file; iterate directly instead.
        for i in self.frontend.text_normalize(
            tts_text, split=True, text_frontend=text_frontend
        ):
            model_input = self.frontend.frontend_instruct2(
                i, instruct_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id
            )
            start_time = time.time()
            logging.info("synthesis text {}".format(i))
            for model_output in self.model.tts(
                **model_input, stream=stream, speed=speed
            ):
                speech_len = model_output["tts_speech"].shape[1] / self.sample_rate
                elapsed = time.time() - start_time
                # An empty chunk must not abort synthesis with ZeroDivisionError.
                rtf = elapsed / speech_len if speech_len > 0 else float("inf")
                logging.info("yield speech len {}, rtf {}".format(speech_len, rtf))
                yield model_output
                start_time = time.time()


# ---- file: paddlespeech/t2s/models/CosyVoice/flow.py (new file in this patch) ----
import math
from typing import Any
from typing import Dict
from typing import List

import paddle
from paddle import nn
from paddle.nn import functional as F

# NOTE(review): SinusoidalPosEmb, TimestepEmbedding, ResnetBlock1D,
# Downsample1D, Upsample1D, Block1D, ConformerWrapper, BasicTransformerBlock
# and the einops-style helpers pack / rearrange / repeat are used below but
# are neither defined nor imported in this file. Decoder cannot be built or
# run until they are brought into scope.


class Decoder(nn.Layer):
    """1-D U-Net estimator: down blocks, mid blocks and up blocks with skips.

    Each down/up stage is ``[resnet, transformer_blocks, resample]``; each mid
    stage is ``[resnet, transformer_blocks]``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        channels=(256, 256),
        dropout=0.05,
        attention_head_dim=64,
        n_blocks=1,
        num_mid_blocks=2,
        num_heads=4,
        act_fn="snake",
        down_block_type="transformer",
        mid_block_type="transformer",
        up_block_type="transformer",
    ):
        """Build the U-Net.

        Args:
            in_channels: channel count of the packed input seen by the first
                resnet block and by the time embedding.
            out_channels: channel count produced by the final projection.
            channels: output width of each down stage; the up path mirrors it.
            dropout: dropout rate passed to every attention block.
            attention_head_dim: per-head width of the attention blocks.
            n_blocks: attention blocks per stage.
            num_mid_blocks: number of mid stages.
            num_heads: attention heads per block.
            act_fn: activation name passed to transformer blocks.
            down_block_type / mid_block_type / up_block_type: ``"transformer"``
                or ``"conformer"``, see ``get_block``.
        """
        super().__init__()
        channels = tuple(channels)
        self.in_channels = in_channels
        self.out_channels = out_channels

        self.time_embeddings = SinusoidalPosEmb(in_channels)
        time_embed_dim = channels[0] * 4
        self.time_mlp = TimestepEmbedding(
            in_channels=in_channels,
            time_embed_dim=time_embed_dim,
            act_fn="silu",
        )

        # paddle's container is LayerList and its conv is Conv1D; the torch
        # names ModuleList / Conv1d do not exist in paddle.nn.
        self.down_blocks = nn.LayerList([])
        self.mid_blocks = nn.LayerList([])
        self.up_blocks = nn.LayerList([])

        output_channel = in_channels
        for i in range(len(channels)):  # pylint: disable=consider-using-enumerate
            input_channel = output_channel
            output_channel = channels[i]
            is_last = i == len(channels) - 1
            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
            transformer_blocks = nn.LayerList(
                [
                    self.get_block(
                        down_block_type,
                        output_channel,
                        attention_head_dim,
                        num_heads,
                        dropout,
                        act_fn,
                    )
                    for _ in range(n_blocks)
                ]
            )
            downsample = (
                Downsample1D(output_channel) if not is_last else nn.Conv1D(output_channel, output_channel, 3, padding=1)
            )

            self.down_blocks.append(nn.LayerList([resnet, transformer_blocks, downsample]))

        for i in range(num_mid_blocks):
            input_channel = channels[-1]
            # The original assigned `out_channels` here, shadowing the
            # parameter while the blocks below read `output_channel`. Both
            # hold channels[-1] at this point, so the built layers are the same.
            output_channel = channels[-1]

            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)

            transformer_blocks = nn.LayerList(
                [
                    self.get_block(
                        mid_block_type,
                        output_channel,
                        attention_head_dim,
                        num_heads,
                        dropout,
                        act_fn,
                    )
                    for _ in range(n_blocks)
                ]
            )

            self.mid_blocks.append(nn.LayerList([resnet, transformer_blocks]))

        channels = channels[::-1] + (channels[0],)
        for i in range(len(channels) - 1):
            input_channel = channels[i]
            output_channel = channels[i + 1]
            is_last = i == len(channels) - 2

            # 2x input width: forward() concatenates the skip connection.
            resnet = ResnetBlock1D(
                dim=2 * input_channel,
                dim_out=output_channel,
                time_emb_dim=time_embed_dim,
            )
            transformer_blocks = nn.LayerList(
                [
                    self.get_block(
                        up_block_type,
                        output_channel,
                        attention_head_dim,
                        num_heads,
                        dropout,
                        act_fn,
                    )
                    for _ in range(n_blocks)
                ]
            )
            upsample = (
                Upsample1D(output_channel, use_conv_transpose=True)
                if not is_last
                else nn.Conv1D(output_channel, output_channel, 3, padding=1)
            )

            self.up_blocks.append(nn.LayerList([resnet, transformer_blocks, upsample]))

        self.final_block = Block1D(channels[-1], channels[-1])
        self.final_proj = nn.Conv1D(channels[-1], self.out_channels, 1)

        self.initialize_weights()
        # nn.init.normal_(self.final_proj.weight)

    @staticmethod
    def get_block(block_type, dim, attention_head_dim, num_heads, dropout, act_fn):
        """Create one attention block of the requested type.

        Raises:
            ValueError: if *block_type* is neither "conformer" nor "transformer".
        """
        if block_type == "conformer":
            block = ConformerWrapper(
                dim=dim,
                dim_head=attention_head_dim,
                heads=num_heads,
                ff_mult=1,
                conv_expansion_factor=2,
                ff_dropout=dropout,
                attn_dropout=dropout,
                conv_dropout=dropout,
                conv_kernel_size=31,
            )
        elif block_type == "transformer":
            block = BasicTransformerBlock(
                dim=dim,
                num_attention_heads=num_heads,
                attention_head_dim=attention_head_dim,
                dropout=dropout,
                activation_fn=act_fn,
            )
        else:
            raise ValueError(f"Unknown block type {block_type}")

        return block

    def initialize_weights(self):
        """Re-initialize Conv1D, GroupNorm and Linear sublayers in place.

        Conv/Linear weights get Kaiming-normal (ReLU gain, the paddle default)
        and zero bias; GroupNorm gets weight 1 and bias 0.
        """
        # paddle has no nn.init module or Layer.modules(); use initializer
        # objects applied to the parameters of self.sublayers().
        kaiming_normal = nn.initializer.KaimingNormal()
        zeros = nn.initializer.Constant(0.0)
        ones = nn.initializer.Constant(1.0)
        for m in self.sublayers():
            if isinstance(m, (nn.Conv1D, nn.Linear)):
                kaiming_normal(m.weight)
                if m.bias is not None:
                    zeros(m.bias)
            elif isinstance(m, nn.GroupNorm):
                if m.weight is not None:
                    ones(m.weight)
                if m.bias is not None:
                    zeros(m.bias)

    def forward(self, x, mask, mu, t, spks=None, cond=None):
        """Forward pass of the UNet1DConditional model.

        Args:
            x: shape (batch_size, in_channels, time)
            mask: shape (batch_size, 1, time)
            mu: packed with x along the channel axis before the first block
            t: shape (batch_size)
            spks (optional): shape (batch_size, condition_channels); when
                given it is repeated over time and packed into the input.
                Defaults to None.
            cond (optional): placeholder for future use, not read here.
                Defaults to None.

        Returns:
            The projected output multiplied by ``mask``.
        """

        t = self.time_embeddings(t)
        t = self.time_mlp(t)

        x = pack([x, mu], "b * t")[0]

        if spks is not None:
            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
            x = pack([x, spks], "b * t")[0]

        hiddens = []
        masks = [mask]
        for resnet, transformer_blocks, downsample in self.down_blocks:
            mask_down = masks[-1]
            x = resnet(x, mask_down, t)
            x = rearrange(x, "b c t -> b t c")
            mask_down = rearrange(mask_down, "b 1 t -> b t")
            for transformer_block in transformer_blocks:
                x = transformer_block(
                    hidden_states=x,
                    attention_mask=mask_down,
                    timestep=t,
                )
            x = rearrange(x, "b t c -> b c t")
            mask_down = rearrange(mask_down, "b t -> b 1 t")
            hiddens.append(x)  # Save hidden states for skip connections
            x = downsample(x * mask_down)
            masks.append(mask_down[:, :, ::2])

        # The mask pushed after the last down stage is dropped: that stage
        # uses a plain conv instead of a downsampler (see __init__).
        masks = masks[:-1]
        mask_mid = masks[-1]

        for resnet, transformer_blocks in self.mid_blocks:
            x = resnet(x, mask_mid, t)
            x = rearrange(x, "b c t -> b t c")
            mask_mid = rearrange(mask_mid, "b 1 t -> b t")
            for transformer_block in transformer_blocks:
                x = transformer_block(
                    hidden_states=x,
                    attention_mask=mask_mid,
                    timestep=t,
                )
            x = rearrange(x, "b t c -> b c t")
            mask_mid = rearrange(mask_mid, "b t -> b 1 t")

        for resnet, transformer_blocks, upsample in self.up_blocks:
            mask_up = masks.pop()
            x = resnet(pack([x, hiddens.pop()], "b * t")[0], mask_up, t)
            x = rearrange(x, "b c t -> b t c")
            mask_up = rearrange(mask_up, "b 1 t -> b t")
            for transformer_block in transformer_blocks:
                x = transformer_block(
                    hidden_states=x,
                    attention_mask=mask_up,
                    timestep=t,
                )
            x = rearrange(x, "b t c -> b c t")
            mask_up = rearrange(mask_up, "b t -> b 1 t")
            x = upsample(x * mask_up)

        x = self.final_block(x, mask_up)
        output = self.final_proj(x * mask_up)

        return output * mask

# ---- next file in this patch: paddlespeech/t2s/models/CosyVoice/frontend.py ----
# ---- file: paddlespeech/t2s/models/CosyVoice/frontend.py (new file in this patch) ----
import json
import os
import re
import unicodedata
from functools import partial
from typing import Callable, Generator

import inflect
import numpy as np
import onnxruntime
import paddle
import paddlespeech
import whisper
import logging
try:
    import ttsfrd

    use_ttsfrd = True
except ImportError:
    print("failed to import ttsfrd, use wetext instead")
    from wetext import Normalizer as EnNormalizer
    from wetext import Normalizer as ZhNormalizer

    use_ttsfrd = False

# text_normalize() calls contains_chinese(), which the original file neither
# defined nor imported (NameError on every non-ttsfrd call).
_CHINESE_CHAR_PATTERN = re.compile(r'[\u4e00-\u9fff]+')


def contains_chinese(text):
    """Return True if *text* has a character in U+4E00..U+9FFF."""
    return bool(_CHINESE_CHAR_PATTERN.search(text))


# split paragraph logic:
# 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len less than merge_len
# 2. calculate sentence len according to lang
# 3. split sentence according to punctuation
def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False):
    """Split *text* at sentence punctuation and regroup it into utterances.

    Args:
        text: paragraph to split; a closing period is added if it does not
            already end with sentence punctuation.
        tokenize: callable used to measure length when *lang* is not "zh".
        lang: "zh" measures length in characters, anything else in tokens.
        token_max_n / token_min_n: an utterance is closed once adding the next
            sentence would exceed the maximum and it is already above the minimum.
        merge_len: a final utterance shorter than this is merged into the
            previous one.
        comma_split: also split at commas.

    Returns:
        List of utterance strings; empty for empty *text*.
    """
    def calc_utt_length(_text: str):
        if lang == "zh":
            return len(_text)
        else:
            return len(tokenize(_text))

    def should_merge(_text: str):
        if lang == "zh":
            return len(_text) < merge_len
        else:
            return len(tokenize(_text)) < merge_len

    # text[-1] below would raise IndexError on an empty string.
    if not text:
        return []

    if lang == "zh":
        pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';']
    else:
        pounc = ['.', '?', '!', ';', ':']
    if comma_split:
        pounc.extend([',', ','])

    if text[-1] not in pounc:
        if lang == "zh":
            text += "。"
        else:
            text += "."

    st = 0
    utts = []
    for i, c in enumerate(text):
        if c in pounc:
            if len(text[st: i]) > 0:
                utts.append(text[st: i] + c)
            if i + 1 < len(text) and text[i + 1] in ['"', '”']:
                # Attach the closing quote to the sentence it ends. If no
                # sentence exists yet (text starting with punctuation + quote)
                # there is nothing to attach it to; the original popped from
                # an empty list here.
                if utts:
                    tmp = utts.pop(-1)
                    utts.append(tmp + text[i + 1])
                st = i + 2
            else:
                st = i + 1

    final_utts = []
    cur_utt = ""
    for utt in utts:
        if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n:
            final_utts.append(cur_utt)
            cur_utt = ""
        cur_utt = cur_utt + utt
    if len(cur_utt) > 0:
        if should_merge(cur_utt) and len(final_utts) != 0:
            final_utts[-1] = final_utts[-1] + cur_utt
        else:
            final_utts.append(cur_utt)

    return final_utts


# spell Arabic numerals
def spell_out_number(text: str, inflect_parser):
    """Replace every run of digits in *text* with ``number_to_words`` output."""
    new_text = []
    st = None
    for i, c in enumerate(text):
        if not c.isdigit():
            if st is not None:
                num_str = inflect_parser.number_to_words(text[st: i])
                new_text.append(num_str)
                st = None
            new_text.append(c)
        else:
            if st is None:
                st = i
    if st is not None and st < len(text):
        # Digits running to the end of the string.
        num_str = inflect_parser.number_to_words(text[st:])
        new_text.append(num_str)
    return ''.join(new_text)


# replace special symbol
def replace_corner_mark(text):
    """Spell out the superscript two / three signs in Chinese."""
    text = text.replace('²', '平方')
    text = text.replace('³', '立方')
    return text


# remove blank between chinese character
def replace_blank(text: str):
    """Drop spaces unless both neighbours are ASCII non-space characters.

    A space at the very start or end of *text* has only one neighbour and is
    dropped; the original indexed ``text[i + 1]`` past the end (IndexError)
    and ``text[i - 1]`` wrapped around to the last character.
    """
    def _ascii_non_space(ch):
        return ch.isascii() and ch != " "

    out_str = []
    last = len(text) - 1
    for i, c in enumerate(text):
        if c == " ":
            if 0 < i < last and _ascii_non_space(text[i + 1]) and _ascii_non_space(text[i - 1]):
                out_str.append(c)
        else:
            out_str.append(c)
    return "".join(out_str)


def is_only_punctuation(text):
    """Return True if *text* is empty or has only punctuation/symbol characters.

    Equivalent to matching ``^[\\p{P}\\p{S}]*$``; done with unicodedata because
    the third-party ``regex`` module the original called is not imported here.
    """
    return all(unicodedata.category(ch)[0] in "PS" for ch in text)


# remove meaningless symbol
def remove_bracket(text):
    """Strip brackets and backticks and turn a double em dash into a space."""
    text = text.replace('(', '').replace(')', '')
    text = text.replace('【', '').replace('】', '')
    text = text.replace('`', '').replace('`', '')
    text = text.replace("——", " ")
    return text


class CosyVoiceFrontEnd:
    """Turns raw text and prompt audio into the tensors the model consumes."""

    def __init__(
        self,
        get_tokenizer: Callable,
        feat_extractor: Callable,
        campplus_model: str,
        speech_tokenizer_model: str,
        spk2info: str = "",
        allowed_special: str = "all",
    ):
        """Create tokenizer, ONNX sessions, speaker table and text normalizers.

        Args:
            get_tokenizer: zero-argument factory returning the text tokenizer.
            feat_extractor: callable applied to prompt speech.
            campplus_model: path of the speaker-embedding ONNX model.
            speech_tokenizer_model: path of the speech-tokenizer ONNX model.
            spk2info: path of a saved speaker table; missing file -> empty table.
            allowed_special: forwarded to ``tokenizer.encode``.
        """
        self.tokenizer = get_tokenizer()
        self.feat_extractor = feat_extractor
        # Use the same "is a GPU actually present" test as the ONNX provider
        # choice below; a CUDA build on a GPU-less host must fall back to CPU.
        has_gpu = paddle.device.cuda.device_count() >= 1
        self.device = paddle.CUDAPlace(0) if has_gpu else paddle.CPUPlace()
        option = onnxruntime.SessionOptions()
        option.graph_optimization_level = (
            onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        )
        option.intra_op_num_threads = 1
        self.campplus_session = onnxruntime.InferenceSession(
            campplus_model, sess_options=option, providers=["CPUExecutionProvider"]
        )
        self.speech_tokenizer_session = onnxruntime.InferenceSession(
            speech_tokenizer_model,
            sess_options=option,
            providers=[
                "CUDAExecutionProvider" if has_gpu else "CPUExecutionProvider"
            ],
        )
        if os.path.exists(spk2info):
            self.spk2info = paddle.load(path=str(spk2info))
        else:
            self.spk2info = {}
        self.allowed_special = allowed_special
        self.use_ttsfrd = use_ttsfrd
        if self.use_ttsfrd:
            self.frd = ttsfrd.TtsFrontendEngine()
            ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
            assert (
                self.frd.initialize(
                    "{}/../../pretrained_models/CosyVoice-ttsfrd/resource".format(
                        ROOT_DIR
                    )
                )
                is True
            ), "failed to initialize ttsfrd resource"
            self.frd.set_lang_type("pinyinvg")
        else:
            self.zh_tn_model = ZhNormalizer(remove_erhua=False)
            self.en_tn_model = EnNormalizer()
            self.inflect_parser = inflect.engine()

    def _extract_text_token(self, text):
        """Tokenize *text*; return ``(tokens, length)``.

        For a generator input the first element is a generator of single-token
        tensors and the length is a placeholder ``[0]``.
        """
        # paddle.tensor is a module, not a constructor; tensors are created
        # with paddle.to_tensor, which also takes the target place.
        if isinstance(text, Generator):
            logging.info(
                "get tts_text generator, will return _extract_text_token_generator!"
            )
            return self._extract_text_token_generator(text), paddle.to_tensor(
                [0], dtype=paddle.int32, place=self.device
            )
        else:
            text_token = self.tokenizer.encode(
                text, allowed_special=self.allowed_special
            )
            text_token = paddle.to_tensor(
                [text_token], dtype=paddle.int32, place=self.device
            )
            text_token_len = paddle.to_tensor(
                [text_token.shape[1]], dtype=paddle.int32, place=self.device
            )
            return text_token, text_token_len

    def _extract_text_token_generator(self, text_generator):
        """Yield one ``[1, 1]`` token tensor at a time for each incoming text."""
        for text in text_generator:
            text_token, _ = self._extract_text_token(text)
            for i in range(text_token.shape[1]):
                yield text_token[:, i : i + 1]

    def _extract_speech_token(self, speech):
        """Run the speech-tokenizer ONNX model on 16 kHz *speech*.

        Returns ``(tokens, length)`` as int32 tensors. Audio longer than 30 s
        is rejected.
        """
        assert (
            speech.shape[1] / 16000 <= 30
        ), "do not support extract speech token for audio longer than 30s"
        feat = whisper.log_mel_spectrogram(speech, n_mels=128)
        speech_token = (
            self.speech_tokenizer_session.run(
                None,
                {
                    self.speech_tokenizer_session.get_inputs()[0]
                    .name: feat.detach()
                    .cpu()
                    .numpy(),
                    self.speech_tokenizer_session.get_inputs()[1].name: np.array(
                        [feat.shape[2]], dtype=np.int32
                    ),
                },
            )[0]
            .flatten()
            .tolist()
        )
        speech_token = paddle.to_tensor(
            [speech_token], dtype=paddle.int32, place=self.device
        )
        speech_token_len = paddle.to_tensor(
            [speech_token.shape[1]], dtype=paddle.int32, place=self.device
        )
        return speech_token, speech_token_len

    def _extract_spk_embedding(self, speech):
        """Compute the speaker embedding of *speech* with the campplus model."""
        # NOTE(review): torchaudio is not imported in this file, so this call
        # raises NameError as written. The original author marked it as
        # pending; import torchaudio or port the fbank extraction.
        feat = torchaudio.compliance.kaldi.fbank(
            speech, num_mel_bins=80, dither=0, sample_frequency=16000
        )
        feat = feat - feat.mean(dim=0, keepdim=True)
        embedding = (
            self.campplus_session.run(
                None,
                {
                    self.campplus_session.get_inputs()[0]
                    .name: feat.unsqueeze(dim=0)
                    .cpu()
                    .numpy()
                },
            )[0]
            .flatten()
            .tolist()
        )
        embedding = paddle.to_tensor([embedding], place=self.device)
        return embedding

    def _extract_speech_feat(self, speech):
        """Run ``feat_extractor`` on *speech*; return ``(features, length)``.

        NOTE(review): squeeze(dim=...) / transpose(0, 1) are torch-style calls;
        whether they work depends on what feat_extractor returns -- confirm.
        """
        speech_feat = (
            self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
        )
        speech_feat = speech_feat.unsqueeze(dim=0)
        speech_feat_len = paddle.to_tensor(
            [speech_feat.shape[1]], dtype=paddle.int32, place=self.device
        )
        return speech_feat, speech_feat_len

    def text_normalize(self, text, split=True, text_frontend=True):
        """Normalize *text* and optionally split it into utterances.

        Returns a list of utterances when *split* is True, otherwise the
        normalized string. Generators are passed through as ``[text]``; with
        *text_frontend* False or empty text nothing is normalized.
        """
        if isinstance(text, Generator):
            logging.info("get tts_text generator, will skip text_normalize!")
            return [text]
        if text_frontend is False or text == "":
            return [text] if split is True else text
        text = text.strip()
        if self.use_ttsfrd:
            texts = [
                i["text"]
                for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]
            ]
            text = "".join(texts)
        elif contains_chinese(text):
            text = self.zh_tn_model.normalize(text)
            text = text.replace("\n", "")
            text = replace_blank(text)
            text = replace_corner_mark(text)
            text = text.replace(".", "。")
            text = text.replace(" - ", ",")
            text = remove_bracket(text)
            text = re.sub("[,,、]+$", "。", text)
            texts = list(
                split_paragraph(
                    text,
                    partial(
                        self.tokenizer.encode, allowed_special=self.allowed_special
                    ),
                    "zh",
                    token_max_n=80,
                    token_min_n=60,
                    merge_len=20,
                    comma_split=False,
                )
            )
        else:
            text = self.en_tn_model.normalize(text)
            text = spell_out_number(text, self.inflect_parser)
            texts = list(
                split_paragraph(
                    text,
                    partial(
                        self.tokenizer.encode, allowed_special=self.allowed_special
                    ),
                    "en",
                    token_max_n=80,
                    token_min_n=60,
                    merge_len=20,
                    comma_split=False,
                )
            )
        texts = [i for i in texts if not is_only_punctuation(i)]
        return texts if split is True else text

    def frontend_sft(self, tts_text, spk_id):
        """Build model input for a stored speaker.

        Raises:
            KeyError: if *spk_id* is not in the speaker table.
        """
        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
        embedding = self.spk2info[spk_id]["embedding"]
        model_input = {
            "text": tts_text_token,
            "text_len": tts_text_token_len,
            "llm_embedding": embedding,
            "flow_embedding": embedding,
        }
        return model_input

    def frontend_zero_shot(
        self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id
    ):
        """Build model input from a text + speech prompt or a cached speaker.

        With an empty *zero_shot_spk_id* all prompt features are extracted from
        the arguments; otherwise the cached entry for that id is used.
        """
        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
        if zero_shot_spk_id == "":
            prompt_text_token, prompt_text_token_len = self._extract_text_token(
                prompt_text
            )
            # NOTE(review): torchaudio is not imported in this file (NameError
            # as written); marked as pending by the original author.
            prompt_speech_resample = torchaudio.transforms.Resample(
                orig_freq=16000, new_freq=resample_rate
            )(prompt_speech_16k)
            speech_feat, speech_feat_len = self._extract_speech_feat(
                prompt_speech_resample
            )
            speech_token, speech_token_len = self._extract_speech_token(
                prompt_speech_16k
            )
            if resample_rate == 24000:
                # Trim both streams so that there are exactly two feature
                # frames per speech token.
                token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
                speech_feat, speech_feat_len[:] = (
                    speech_feat[:, : 2 * token_len],
                    2 * token_len,
                )
                speech_token, speech_token_len[:] = (
                    speech_token[:, :token_len],
                    token_len,
                )
            embedding = self._extract_spk_embedding(prompt_speech_16k)
            model_input = {
                "prompt_text": prompt_text_token,
                "prompt_text_len": prompt_text_token_len,
                "llm_prompt_speech_token": speech_token,
                "llm_prompt_speech_token_len": speech_token_len,
                "flow_prompt_speech_token": speech_token,
                "flow_prompt_speech_token_len": speech_token_len,
                "prompt_speech_feat": speech_feat,
                "prompt_speech_feat_len": speech_feat_len,
                "llm_embedding": embedding,
                "flow_embedding": embedding,
            }
        else:
            # Shallow copy: callers add "text" and delete prompt keys from the
            # returned dict, which must not alter the cached speaker entry.
            model_input = dict(self.spk2info[zero_shot_spk_id])
        model_input["text"] = tts_text_token
        model_input["text_len"] = tts_text_token_len
        return model_input

    def frontend_cross_lingual(
        self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id
    ):
        """Zero-shot input without the prompt text and LLM prompt tokens."""
        model_input = self.frontend_zero_shot(
            tts_text, "", prompt_speech_16k, resample_rate, zero_shot_spk_id
        )
        del model_input["prompt_text"]
        del model_input["prompt_text_len"]
        del model_input["llm_prompt_speech_token"]
        del model_input["llm_prompt_speech_token_len"]
        return model_input

    def frontend_instruct(self, tts_text, spk_id, instruct_text):
        """SFT input with the instruction as prompt text and no LLM embedding."""
        model_input = self.frontend_sft(tts_text, spk_id)
        del model_input["llm_embedding"]
        # NOTE(review): an empty string is appended here while
        # frontend_instruct2 appends "<|endofprompt|>"; an end-of-prompt
        # marker may have been lost -- confirm.
        instruct_text_token, instruct_text_token_len = self._extract_text_token(
            instruct_text + ""
        )
        model_input["prompt_text"] = instruct_text_token
        model_input["prompt_text_len"] = instruct_text_token_len
        return model_input

    def frontend_instruct2(
        self,
        tts_text,
        instruct_text,
        prompt_speech_16k,
        resample_rate,
        zero_shot_spk_id,
    ):
        """Zero-shot input using ``instruct_text + "<|endofprompt|>"`` as the
        prompt text and without the LLM prompt speech tokens."""
        model_input = self.frontend_zero_shot(
            tts_text,
            instruct_text + "<|endofprompt|>",
            prompt_speech_16k,
            resample_rate,
            zero_shot_spk_id,
        )
        del model_input["llm_prompt_speech_token"]
        del model_input["llm_prompt_speech_token_len"]
        return model_input

    def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):
        """Build voice-conversion input from source and prompt speech."""
        prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(
            prompt_speech_16k
        )
        # NOTE(review): torchaudio is not imported in this file (NameError as
        # written); marked as pending by the original author.
        prompt_speech_resample = torchaudio.transforms.Resample(
            orig_freq=16000, new_freq=resample_rate
        )(prompt_speech_16k)
        prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(
            prompt_speech_resample
        )
        embedding = self._extract_spk_embedding(prompt_speech_16k)
        source_speech_token, source_speech_token_len = self._extract_speech_token(
            source_speech_16k
        )
        model_input = {
            "source_speech_token": source_speech_token,
            "source_speech_token_len": source_speech_token_len,
            "flow_prompt_speech_token": prompt_speech_token,
            "flow_prompt_speech_token_len": prompt_speech_token_len,
            "prompt_speech_feat": prompt_speech_feat,
            "prompt_speech_feat_len": prompt_speech_feat_len,
            "flow_embedding": embedding,
        }
        return model_input

# ---- next file in this patch: paddlespeech/t2s/models/CosyVoice/llm.py ----
b/paddlespeech/t2s/models/CosyVoice/llm.py new file mode 100644 index 0000000000..6c509d9ab3 --- /dev/null +++ b/paddlespeech/t2s/models/CosyVoice/llm.py @@ -0,0 +1,737 @@ +import queue +import random +import threading +import time +from typing import Callable, Dict, Generator, List, Optional +import logging +import paddle.nn.functional as F +import paddle +IGNORE_ID = -1 +# from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss +# from cosyvoice.utils.common import IGNORE_ID, th_accuracy +# from cosyvoice.utils.file_utils import logging +# from cosyvoice.utils.mask import make_pad_mask +import torch +LabelSmoothingLoss = None +def ras_sampling(weighted_scores, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1): + top_ids = nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k) + recent_tokens = paddle.to_tensor(decoded_tokens[-win_size:], dtype='int64') + rep_num = paddle.sum(recent_tokens.cpu() == top_ids.cpu()).cpu().item() + if rep_num >= win_size * tau_r: + top_ids = random_sampling(weighted_scores, decoded_tokens, sampling) + return top_ids + + +def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25): + softmax_scores = paddle.nn.functional.softmax(weighted_scores, axis=0) + sorted_indices = paddle.argsort(softmax_scores, axis=0, descending=True) + sorted_probs = paddle.gather(softmax_scores, sorted_indices, axis=0) + + prob_list = [] + indices_list = [] + cum_prob = 0.0 + + for i in range(len(sorted_indices)): + if cum_prob < top_p and len(prob_list) < top_k: + cum_prob += sorted_probs[i].item() + prob_list.append(sorted_probs[i]) + indices_list.append(sorted_indices[i]) + else: + break + + prob_tensor = paddle.to_tensor(prob_list, dtype=weighted_scores.dtype) + indices_tensor = paddle.to_tensor(indices_list, dtype='int64') + top_ids = indices_tensor[paddle.multinomial(prob_tensor, num_samples=1, replacement=True)] + + return top_ids + + +def random_sampling(weighted_scores, decoded_tokens, sampling): + probs = 
paddle.nn.functional.softmax(weighted_scores, axis=0) + top_ids = paddle.multinomial(probs, num_samples=1, replacement=True) + return top_ids +def make_pad_mask(lengths: paddle.Tensor, max_len: int = 0) -> paddle.Tensor: + batch_size = lengths.shape[0] + max_len = max_len if max_len > 0 else lengths.max().item() + seq_range = paddle.arange(0, max_len, dtype='int64') + seq_range_expand = seq_range.unsqueeze(0).expand([batch_size, max_len]) + seq_length_expand = lengths.unsqueeze(-1) + mask = seq_range_expand >= seq_length_expand + return mask + +def th_accuracy(pad_outputs: paddle.Tensor, pad_targets: paddle.Tensor, + ignore_label: int) -> paddle.Tensor: + pad_pred = pad_outputs.reshape((pad_targets.shape[0], pad_targets.shape[1], -1)).argmax(axis=2) + mask = pad_targets != ignore_label + numerator = paddle.sum((pad_pred[mask] == pad_targets[mask]).astype('float32')) + denominator = paddle.sum(mask.astype('float32')) + accuracy = numerator / denominator + + return accuracy.detach() +class TransformerLM(paddle.nn.Layer): + def __init__( + self, + text_encoder_input_size: int, + llm_input_size: int, + llm_output_size: int, + text_token_size: int, + speech_token_size: int, + text_encoder: paddle.nn.Layer, + llm: paddle.nn.Layer, + sampling: Callable, + length_normalized_loss: bool = True, + lsm_weight: float = 0.0, + spk_embed_dim: int = 192, + ): + super().__init__() + self.llm_input_size = llm_input_size + self.speech_token_size = speech_token_size + self.text_embedding = paddle.nn.Embedding( + text_token_size, text_encoder_input_size + ) + self.text_encoder = text_encoder + self.text_encoder_affine_layer = paddle.nn.Linear( + in_features=self.text_encoder.output_size(), out_features=llm_input_size + ) + self.sos_eos = 0 + self.task_id = 1 + self.llm_embedding = paddle.nn.Embedding(2, llm_input_size) + self.llm = llm + self.llm_decoder = paddle.nn.Linear( + in_features=llm_output_size, out_features=speech_token_size + 1 + ) + + self.criterion_ce = LabelSmoothingLoss( 
+ size=speech_token_size + 1, + padding_idx=IGNORE_ID, + smoothing=lsm_weight, + normalize_length=length_normalized_loss, + ) + self.speech_embedding = paddle.nn.Embedding(speech_token_size, llm_input_size) + self.spk_embed_affine_layer = paddle.nn.Linear( + in_features=spk_embed_dim, out_features=llm_input_size + ) + self.sampling = sampling + + def encode(self, text: paddle.Tensor, text_lengths: paddle.Tensor): + encoder_out, encoder_mask = self.text_encoder( + text, text_lengths, decoding_chunk_size=1, num_decoding_left_chunks=-1 + ) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) + encoder_out = self.text_encoder_affine_layer(encoder_out) + return encoder_out, encoder_out_lens + + def pad_unpad_sequence( + self, + sos_eos_emb, + embedding, + text_token, + text_token_len, + task_id_emb, + speech_token, + speech_token_len, + ): + + text_token = paddle.static.nn.sequence_unpad( + text_token, text_token_len.cpu() + ) + speech_token = paddle.static.nn.sequence_unpad( + speech_token, speech_token_len.cpu() + ) + lm_input = [ + paddle.cat( + [ + sos_eos_emb.squeeze(dim=0), + embedding[i], + text_token[i], + task_id_emb.squeeze(dim=0), + speech_token[i], + ], + dim=0, + ) + for i in range(len(text_token)) + ] + lm_input_len = paddle.tensor([i.size(0) for i in lm_input], dtype=paddle.int32) + lm_input = paddle.static.nn.sequence_unpad( + lm_input, batch_first=True, padding_value=IGNORE_ID + ) + return lm_input, lm_input_len + + def forward( + self, batch: dict, device: torch.device + ) -> Dict[str, Optional[paddle.Tensor]]: + """ + Args: + text: (B, L, D) + text_lengths: (B,) + audio: (B, T, N) or (B, T) + audio_lengths: (B,) + """ + text_token = batch["text_token"].to(device) + text_token_len = batch["text_token_len"].to(device) + speech_token = batch["speech_token"].to(device) + speech_token_len = batch["speech_token_len"].to(device) + embedding = batch["embedding"].to(device) + lm_target = [ + paddle.tensor( + [IGNORE_ID] * (2 + text_token_len[i]) + + 
speech_token[i, : speech_token_len[i]].tolist() + + [self.speech_token_size] + ) + for i in range(text_token.size(0)) + ] + lm_target = torch.nn.utils.rnn.pad_sequence( + lm_target, batch_first=True, padding_value=IGNORE_ID + ).to(device) + text_token = self.text_embedding(text_token) + text_token, text_token_len = self.encode(text_token, text_token_len) + embedding = paddle.nn.functional.normalize(x=embedding, axis=1) + embedding = self.spk_embed_affine_layer(embedding) + embedding = embedding.unsqueeze(1) + sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1) + task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1) + speech_token = self.speech_embedding(speech_token) + lm_input, lm_input_len = self.pad_unpad_sequence( + sos_eos_emb, + embedding, + text_token, + text_token_len, + task_id_emb, + speech_token, + speech_token_len, + ) + lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device)) + logits = self.llm_decoder(lm_output) + loss = self.criterion_ce(logits, lm_target) + acc = th_accuracy( + logits.view(-1, self.speech_token_size + 1), + lm_target, + ignore_label=IGNORE_ID, + ) + return {"loss": loss, "acc": acc} + + def sampling_ids( + self, + weighted_scores: paddle.Tensor, + decoded_tokens: List, + sampling: int, + ignore_eos: bool = True, + ): + num_trials, max_trials = 0, 100 + while True: + top_ids = self.sampling(weighted_scores, decoded_tokens, sampling) + if not ignore_eos or self.speech_token_size not in top_ids: + break + num_trials += 1 + if num_trials > max_trials: + raise RuntimeError( + "sampling reaches max_trials {} and still get eos when ignore_eos is True, check your input!".format( + max_trials + ) + ) + return top_ids + + @paddle.no_grad() + def inference( + self, + text: paddle.Tensor, + text_len: paddle.Tensor, + prompt_text: paddle.Tensor, + prompt_text_len: paddle.Tensor, + prompt_speech_token: paddle.Tensor, + prompt_speech_token_len: paddle.Tensor, + embedding: paddle.Tensor, + 
sampling: int = 25, + max_token_text_ratio: float = 20, + min_token_text_ratio: float = 2, + uuid: str = "", + ) -> Generator[paddle.Tensor, None, None]: + device = text.place + text = paddle.cat([prompt_text, text], dim=1) + text_len += prompt_text_len + text = self.text_embedding(text) + text, text_len = self.encode(text, text_len) + if embedding.shape[0] != 0: + embedding = paddle.nn.functional.normalize(x=embedding, axis=1) + embedding = self.spk_embed_affine_layer(embedding) + embedding = embedding.unsqueeze(dim=1) + else: + embedding = ( + paddle.zeros(1, 0, self.llm_input_size, dtype=text.dtype) + .to(device) + .to(text.dtype) + ) + sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1) + task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1) + if prompt_speech_token_len != 0: + prompt_speech_token_emb = self.speech_embedding(prompt_speech_token) + else: + prompt_speech_token_emb = paddle.zeros( + 1, 0, self.llm_input_size, dtype=text.dtype + ).to(device) + lm_input = paddle.cat( + [sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1 + ) + min_len = int((text_len - prompt_text_len) * min_token_text_ratio) + max_len = int((text_len - prompt_text_len) * max_token_text_ratio) + out_tokens = [] + offset = 0 + att_cache, cnn_cache = paddle.zeros( + (0, 0, 0, 0), device=lm_input.place + ), paddle.zeros((0, 0, 0, 0), device=lm_input.place) + for i in range(max_len): + y_pred, att_cache, cnn_cache = self.llm.forward_chunk( + lm_input, + offset=offset, + required_cache_size=-1, + att_cache=att_cache, + cnn_cache=cnn_cache, + att_mask=paddle.tril( + paddle.ones( + (1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.place + ) + ).to(paddle.bool), + ) + logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) + if i == 0: + logp[:, self.speech_token_size] = -float("inf") + top_ids = self.sampling_ids( + logp.squeeze(dim=0), + out_tokens, + sampling, + ignore_eos=True if i < min_len else False, + ).item() 
+ if top_ids == self.speech_token_size: + break + yield top_ids + out_tokens.append(top_ids) + offset += lm_input.size(1) + lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) + + +class Qwen2Encoder(paddle.nn.Layer): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, xs: paddle.Tensor, xs_lens: paddle.Tensor): + T = xs.size(1) + masks = ~make_pad_mask(xs_lens, T) + outs = self.model( + inputs_embeds=xs, + attention_mask=masks, + output_hidden_states=True, + return_dict=True, + ) + return outs.hidden_states[-1], masks.unsqueeze(1) + + def forward_one_step(self, xs, masks, cache=None): + input_masks = masks[:, -1, :] + outs = self.model( + inputs_embeds=xs, + attention_mask=input_masks, + output_hidden_states=True, + return_dict=True, + use_cache=True, + past_key_values=cache, + ) + xs = outs.hidden_states[-1] + new_cache = outs.past_key_values + xs = paddle.cast(xs, dtype = 'float32') + return xs, new_cache + + +class Qwen2LM(TransformerLM): + def __init__( + self, + llm_input_size: int, + llm_output_size: int, + speech_token_size: int, + llm: paddle.nn.Layer, + sampling: Callable, + length_normalized_loss: bool = True, + lsm_weight: float = 0.0, + mix_ratio: List[int] = [5, 15], + ): + paddle.nn.Layer.__init__(self) + self.llm_input_size = llm_input_size + self.llm_output_size = llm_output_size + self.speech_token_size = speech_token_size + self.sos_eos = 0 + self.task_id = 1 + self.fill_token = 2 + self.llm_embedding = paddle.nn.Embedding(2, llm_input_size) + self.llm = llm + self.llm_decoder = paddle.nn.Linear( + in_features=llm_output_size, out_features=speech_token_size + 3 + ) + # self.llm_decoder.weight = paddle.create_parameter( + # shape=self.llm_decoder.weight.shape, + # dtype='bfloat16', + # default_initializer=paddle.nn.initializer.Assign(self.llm_decoder.weight.astype('bfloat16')) + # ) + # if self.llm_decoder.bias is not None: + # self.llm_decoder.bias = paddle.create_parameter( + # 
shape=self.llm_decoder.bias.shape, + # dtype='bfloat16', + # default_initializer=paddle.nn.initializer.Assign(self.llm_decoder.bias.astype('bfloat16')) + # ) + # self.criterion_ce = LabelSmoothingLoss( + # size=speech_token_size + 3, + # padding_idx=IGNORE_ID, + # smoothing=lsm_weight, + # normalize_length=length_normalized_loss, + # ) + self.speech_embedding = paddle.nn.Embedding( + speech_token_size + 3, llm_input_size + ) + self.sampling = sampling + self.mix_ratio = mix_ratio + self.stop_token_ids = [(speech_token_size + i) for i in range(3)] + self.vllm_output_queue = {} + + # def prepare_lm_input_target( + # self, + # text_token, + # text_token_emb, + # text_token_len, + # speech_token, + # speech_token_emb, + # speech_token_len, + # ): + # lm_target, lm_input = [], [] + # text_token = torch.nn.utils.rnn.unpad_sequence( + # text_token, text_token_len.cpu(), batch_first=True + # ) + # speech_token = torch.nn.utils.rnn.unpad_sequence( + # speech_token, speech_token_len.cpu(), batch_first=True + # ) + # text_token_emb = torch.nn.utils.rnn.unpad_sequence( + # text_token_emb, text_token_len.cpu(), batch_first=True + # ) + # speech_token_emb = torch.nn.utils.rnn.unpad_sequence( + # speech_token_emb, speech_token_len.cpu(), batch_first=True + # ) + # for i in range(len(text_token)): + # if ( + # random.random() < 0.5 + # and speech_token_len[i] / text_token_len[i] + # > self.mix_ratio[1] / self.mix_ratio[0] + # ): + # this_lm_target, this_lm_input = [], [] + # this_lm_target.append(IGNORE_ID) + # this_lm_input.append( + # self.llm_embedding.weight[self.sos_eos].reshape(1, -1) + # ) + # for j in range( + # ((text_token_len[i] + 1) / self.mix_ratio[0]).ceil().int().item() + # ): + # this_text_token = text_token[i][ + # j * self.mix_ratio[0] : (j + 1) * self.mix_ratio[0] + # ].tolist() + # this_speech_token = speech_token[i][ + # j * self.mix_ratio[1] : (j + 1) * self.mix_ratio[1] + # ].tolist() + # if len(this_text_token) == self.mix_ratio[0]: + # assert 
len(this_speech_token) == self.mix_ratio[1] + # this_lm_target += [IGNORE_ID] * (self.mix_ratio[0] - 1) + # this_lm_target += this_speech_token + # this_lm_target.append(self.speech_token_size + 2) + # this_lm_input.append( + # text_token_emb[i][ + # j * self.mix_ratio[0] : (j + 1) * self.mix_ratio[0] + # ] + # ) + # this_lm_input.append( + # speech_token_emb[i][ + # j * self.mix_ratio[1] : (j + 1) * self.mix_ratio[1] + # ] + # ) + # else: + # this_lm_target += [-1] * len(this_text_token) + # this_lm_target += speech_token[i][ + # j * self.mix_ratio[1] : + # ].tolist() + # this_lm_target.append(self.speech_token_size) + # this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0] :]) + # this_lm_input.append( + # self.llm_embedding.weight[self.task_id].reshape(1, -1) + # ) + # this_lm_input.append( + # speech_token_emb[i][j * self.mix_ratio[1] :] + # ) + # this_lm_target, this_lm_input = paddle.tensor( + # this_lm_target + # ), paddle.cat(this_lm_input, dim=0) + # else: + # this_lm_target = paddle.tensor( + # [IGNORE_ID] * (1 + text_token_len[i]) + # + speech_token[i].tolist() + # + [self.speech_token_size] + # ) + # this_lm_input = paddle.cat( + # [ + # self.llm_embedding.weight[self.sos_eos].reshape(1, -1), + # text_token_emb[i], + # self.llm_embedding.weight[self.task_id].reshape(1, -1), + # speech_token_emb[i], + # ], + # dim=0, + # ) + # lm_target.append(this_lm_target) + # lm_input.append(this_lm_input) + # lm_input_len = paddle.tensor([i.size(0) for i in lm_input], dtype=paddle.int32) + # lm_input = torch.nn.utils.rnn.pad_sequence( + # lm_input, batch_first=True, padding_value=IGNORE_ID + # ) + # lm_target = torch.nn.utils.rnn.pad_sequence( + # lm_target, batch_first=True, padding_value=IGNORE_ID + # ) + # return lm_target, lm_input, lm_input_len + + @paddle.no_grad() + def inference( + self, + text: paddle.Tensor, + text_len: paddle.Tensor, + prompt_text: paddle.Tensor, + prompt_text_len: paddle.Tensor, + prompt_speech_token: paddle.Tensor, + 
prompt_speech_token_len: paddle.Tensor, + embedding: paddle.Tensor, + sampling: int = 25, + max_token_text_ratio: float = 20, + min_token_text_ratio: float = 2, + uuid: str = "", + ) -> Generator[paddle.Tensor, None, None]: + device = text.place + text = paddle.cat([prompt_text, text], dim=1) + text_len += prompt_text_len + text = self.llm.model.qwen2.embed_tokens(text) + sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape([1, 1, -1]) + task_id_emb = self.llm_embedding.weight[self.task_id].reshape([1, 1, -1]) + if prompt_speech_token_len != 0: + prompt_speech_token_emb = self.speech_embedding(prompt_speech_token) + else: + prompt_speech_token_emb = paddle.zeros( + 1, 0, self.llm_input_size, dtype=text.dtype + ).to(device) + text = paddle.cast(text,dtype = 'float32') + lm_input = paddle.cat( + [sos_eos_emb, text, task_id_emb, prompt_speech_token_emb], dim=1 + ) + min_len = int((text_len - prompt_text_len) * min_token_text_ratio) + max_len = int((text_len - prompt_text_len) * max_token_text_ratio) + for token in self.inference_wrapper(lm_input, sampling, min_len, max_len, uuid): + yield token + + @paddle.no_grad() + def inference_wrapper(self, lm_input, sampling, min_len, max_len, uuid): + if hasattr(self, "vllm"): + from vllm import RequestOutput, SamplingParams + + sampling_params = SamplingParams( + top_k=sampling, + stop_token_ids=self.stop_token_ids, + min_tokens=min_len, + max_tokens=max_len, + ) + with self.lock: + self.vllm.add_request( + uuid, + { + "prompt_embeds": lm_input.squeeze(0) + .to(paddle.bfloat16) + .to(lm_input.place) + }, + sampling_params, + ) + self.vllm_output_queue[uuid] = queue.Queue() + out_tokens = [] + while True: + with self.lock: + if self.vllm_output_queue[uuid].empty() is True: + request_outputs: List[RequestOutput] = self.vllm.step() + for request_output in request_outputs: + top_ids = list(request_output.outputs[0].token_ids)[-1] + self.vllm_output_queue[request_output.request_id].put( + top_ids + ) + if 
self.vllm_output_queue[uuid].empty() is False: + top_ids = self.vllm_output_queue[uuid].get() + if top_ids in self.stop_token_ids: + break + yield top_ids + out_tokens.append(top_ids) + if len(out_tokens) == max_len: + break + time.sleep(0.001) + with self.lock: + self.vllm_output_queue.pop(uuid) + else: + out_tokens = [] + cache = None + for i in range(max_len): + y_pred, cache = self.llm.forward_one_step( + lm_input, + masks=paddle.tril( + paddle.ones( + (1, lm_input.shape[1], lm_input.shape[1]), + ) + ).to(paddle.bool), + cache=cache, + ) + logp = F.log_softmax(self.llm_decoder(y_pred[:, -1]), axis = -1) + top_ids = self.sampling_ids( + logp.squeeze(axis=0), + out_tokens, + sampling, + ignore_eos=True if i < min_len else False, + ).item() + if top_ids == self.speech_token_size: + break + if top_ids > self.speech_token_size: + continue + yield top_ids + out_tokens.append(top_ids) + lm_input = self.speech_embedding.weight[top_ids].reshape([1, 1, -1]) + + @paddle.no_grad() + def inference_bistream( + self, + text: Generator, + prompt_text: paddle.Tensor, + prompt_text_len: paddle.Tensor, + prompt_speech_token: paddle.Tensor, + prompt_speech_token_len: paddle.Tensor, + embedding: paddle.Tensor, + sampling: int = 25, + max_token_text_ratio: float = 20, + min_token_text_ratio: float = 2, + ) -> Generator[paddle.Tensor, None, None]: + device = prompt_text.place + sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1) + task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1) + if prompt_speech_token_len != 0: + prompt_speech_token_emb = self.speech_embedding(prompt_speech_token) + else: + prompt_speech_token_emb = paddle.zeros( + 1, 0, self.llm_input_size, dtype=prompt_text.dtype + ).to(device) + lm_input = paddle.cat([sos_eos_emb], dim=1) + out_tokens = [] + cache = None + text_cache = self.llm.model.model.embed_tokens(prompt_text) + next_fill_index = -1 + for this_text in text: + text_cache = paddle.cat( + [text_cache, 
self.llm.model.model.embed_tokens(this_text)], dim=1 + ) + while prompt_speech_token_emb.size(1) != 0: + if text_cache.size(1) >= self.mix_ratio[0]: + lm_input_text, lm_input_speech = ( + text_cache[:, : self.mix_ratio[0]], + prompt_speech_token_emb[:, : self.mix_ratio[1]], + ) + logging.info( + "append {} text token {} speech token".format( + lm_input_text.size(1), lm_input_speech.size(1) + ) + ) + lm_input = paddle.cat( + [lm_input, lm_input_text, lm_input_speech], dim=1 + ) + text_cache, prompt_speech_token_emb = ( + text_cache[:, self.mix_ratio[0] :], + prompt_speech_token_emb[:, self.mix_ratio[1] :], + ) + else: + logging.info("not enough text token to decode, wait for more") + break + if prompt_speech_token_emb.size(1) == 0: + if ( + len(out_tokens) != 0 + and out_tokens[-1] == self.speech_token_size + 2 + or len(out_tokens) == 0 + and lm_input.size(1) == 1 + ): + logging.info("get fill token, need to append more text token") + if text_cache.size(1) >= self.mix_ratio[0]: + lm_input_text = text_cache[:, : self.mix_ratio[0]] + logging.info( + "append {} text token".format(lm_input_text.size(1)) + ) + if ( + len(out_tokens) != 0 + and out_tokens[-1] == self.speech_token_size + 2 + ): + lm_input = lm_input_text + else: + lm_input = paddle.cat([lm_input, lm_input_text], dim=1) + text_cache = text_cache[:, self.mix_ratio[0] :] + else: + logging.info("not enough text token to decode, wait for more") + continue + while True: + seq_len = ( + lm_input.shape[1] + if cache is None + else lm_input.shape[1] + cache[0][0].size(2) + ) + y_pred, cache = self.llm.forward_one_step( + lm_input, + masks=paddle.tril( + paddle.ones((1, seq_len, seq_len), device=lm_input.place) + ).to(paddle.bool), + cache=cache, + ) + logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) + if next_fill_index != -1 and len(out_tokens) == next_fill_index: + top_ids = self.speech_token_size + 2 + next_fill_index += self.mix_ratio[1] + 1 + else: + top_ids = self.sampling_ids( + 
logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True + ).item() + if top_ids == self.speech_token_size + 2: + next_fill_index = len(out_tokens) + self.mix_ratio[1] + 1 + logging.info( + "fill_token index {} next fill_token index {}".format( + len(out_tokens), next_fill_index + ) + ) + out_tokens.append(top_ids) + if top_ids >= self.speech_token_size: + if top_ids == self.speech_token_size + 2: + break + else: + raise ValueError("should not get token {}".format(top_ids)) + yield top_ids + lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) + lm_input = paddle.cat([lm_input, text_cache, task_id_emb], dim=1) + logging.info("no more text token, decode until met eos") + while True: + seq_len = ( + lm_input.shape[1] + if cache is None + else lm_input.shape[1] + cache[0][0].size(2) + ) + y_pred, cache = self.llm.forward_one_step( + lm_input, + masks=paddle.tril( + paddle.ones((1, seq_len, seq_len), device=lm_input.place) + ).to(paddle.bool), + cache=cache, + ) + logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) + top_ids = self.sampling_ids( + logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=False + ).item() + out_tokens.append(top_ids) + if top_ids >= self.speech_token_size: + if top_ids == self.speech_token_size: + break + else: + raise ValueError("should not get token {}".format(top_ids)) + yield top_ids + lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) diff --git a/paddlespeech/t2s/models/CosyVoice/model.py b/paddlespeech/t2s/models/CosyVoice/model.py new file mode 100644 index 0000000000..aff8573bba --- /dev/null +++ b/paddlespeech/t2s/models/CosyVoice/model.py @@ -0,0 +1,597 @@ +import os +import threading +import time +import uuid +from contextlib import nullcontext +from typing import Generator + +import numpy as np +import paddle + +# from cosyvoice.utils.common import TrtContextWrapper, fade_in_out +# from cosyvoice.utils.file_utils import * +# from cosyvoice.utils.file_utils import convert_onnx_to_trt, 
export_cosyvoice2_vllm + + +class CosyVoiceModel: + def __init__( + self, + llm: paddle.nn.Layer, + flow: paddle.nn.Layer, + hift: paddle.nn.Layer, + fp16: bool = False, + ): + self.device = device2str( + "cuda" if paddle.device.cuda.device_count() >= 1 else "cpu" + ) + self.llm = llm + self.flow = flow + self.hift = hift + self.fp16 = fp16 + if self.fp16 is True: + self.llm.half() + self.flow.half() + self.token_min_hop_len = 2 * self.flow.input_frame_rate + self.token_max_hop_len = 4 * self.flow.input_frame_rate + self.token_overlap_len = 20 + self.mel_overlap_len = int( + self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256 + ) + self.mel_window = np.hamming(2 * self.mel_overlap_len) + self.mel_cache_len = 20 + self.source_cache_len = int(self.mel_cache_len * 256) + self.speech_window = np.hamming(2 * self.source_cache_len) + self.stream_scale_factor = 1 + assert ( + self.stream_scale_factor >= 1 + ), "stream_scale_factor should be greater than 1, change it according to your actual rtf" + self.llm_context = ( + paddle.device.stream_guard( + paddle.device.Stream(device=device2str(self.device)) + ) + if paddle.device.cuda.device_count() >= 1 + else nullcontext() + ) + self.lock = threading.Lock() + self.tts_speech_token_dict = {} + self.llm_end_dict = {} + self.mel_overlap_dict = {} + self.flow_cache_dict = {} + self.hift_cache_dict = {} + + def load(self, llm_model, flow_model, hift_model): + self.llm.set_state_dict(state_dict=paddle.load(path=str(llm_model))) + self.llm.to(self.device).eval() + self.flow.set_state_dict(state_dict=paddle.load(path=str(flow_model))) + self.flow.to(self.device).eval() + hift_state_dict = { + k.replace("generator.", ""): v + for k, v in paddle.load(path=str(hift_model)).items() + } + self.hift.set_state_dict(state_dict=hift_state_dict) + self.hift.to(self.device).eval() + + def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model): + llm_text_encoder = torch.jit.load( + llm_text_encoder_model, 
map_location=self.device + ) + self.llm.text_encoder = llm_text_encoder + llm_llm = torch.jit.load(llm_llm_model, map_location=self.device) + self.llm.llm = llm_llm + flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device) + self.flow.encoder = flow_encoder + + def load_trt( + self, + flow_decoder_estimator_model, + flow_decoder_onnx_model, + trt_concurrent, + fp16, + ): + assert paddle.device.cuda.device_count() >= 1, "tensorrt only supports gpu!" + if ( + not os.path.exists(flow_decoder_estimator_model) + or os.path.getsize(flow_decoder_estimator_model) == 0 + ): + convert_onnx_to_trt( + flow_decoder_estimator_model, + self.get_trt_kwargs(), + flow_decoder_onnx_model, + fp16, + ) + del self.flow.decoder.estimator + import tensorrt as trt + + with open(flow_decoder_estimator_model, "rb") as f: + estimator_engine = trt.Runtime( + trt.Logger(trt.Logger.INFO) + ).deserialize_cuda_engine(f.read()) + assert estimator_engine is not None, "failed to load trt {}".format( + flow_decoder_estimator_model + ) + self.flow.decoder.estimator = TrtContextWrapper( + estimator_engine, trt_concurrent=trt_concurrent, device=self.device + ) + + def get_trt_kwargs(self): + min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4)] + opt_shape = [(2, 80, 500), (2, 1, 500), (2, 80, 500), (2, 80, 500)] + max_shape = [(2, 80, 3000), (2, 1, 3000), (2, 80, 3000), (2, 80, 3000)] + input_names = ["x", "mask", "mu", "cond"] + return { + "min_shape": min_shape, + "opt_shape": opt_shape, + "max_shape": max_shape, + "input_names": input_names, + } + + def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid): + with self.llm_context, paddle.amp.auto_cast( + enable=self.fp16 is True and hasattr(self.llm, "vllm") is False + ): + if isinstance(text, Generator): + assert isinstance(self, CosyVoice2Model) and not hasattr( + self.llm, "vllm" + ), "streaming input text is only implemented for CosyVoice2 and do not support vllm!" 
+ for i in self.llm.inference_bistream( + text=text, + prompt_text=prompt_text.to(self.device), + prompt_text_len=paddle.tensor( + [prompt_text.shape[1]], dtype=paddle.int32 + ).to(self.device), + prompt_speech_token=llm_prompt_speech_token.to(self.device), + prompt_speech_token_len=paddle.tensor( + [llm_prompt_speech_token.shape[1]], dtype=paddle.int32 + ).to(self.device), + embedding=llm_embedding.to(self.device), + ): + self.tts_speech_token_dict[uuid].append(i) + else: + for i in self.llm.inference( + text=text.to(self.device), + text_len=paddle.tensor([text.shape[1]], dtype=paddle.int32).to( + self.device + ), + prompt_text=prompt_text.to(self.device), + prompt_text_len=paddle.tensor( + [prompt_text.shape[1]], dtype=paddle.int32 + ).to(self.device), + prompt_speech_token=llm_prompt_speech_token.to(self.device), + prompt_speech_token_len=paddle.tensor( + [llm_prompt_speech_token.shape[1]], dtype=paddle.int32 + ).to(self.device), + embedding=llm_embedding.to(self.device), + uuid=uuid, + ): + self.tts_speech_token_dict[uuid].append(i) + self.llm_end_dict[uuid] = True + + def vc_job(self, source_speech_token, uuid): + self.tts_speech_token_dict[uuid] = source_speech_token.flatten().tolist() + self.llm_end_dict[uuid] = True + + def token2wav( + self, + token, + prompt_token, + prompt_feat, + embedding, + uuid, + finalize=False, + speed=1.0, + ): + with paddle.amp.auto_cast(enable=self.fp16): + tts_mel, self.flow_cache_dict[uuid] = self.flow.inference( + token=token.to(self.device), + token_len=paddle.tensor([token.shape[1]], dtype=paddle.int32).to( + self.device + ), + prompt_token=prompt_token.to(self.device), + prompt_token_len=paddle.tensor( + [prompt_token.shape[1]], dtype=paddle.int32 + ).to(self.device), + prompt_feat=prompt_feat.to(self.device), + prompt_feat_len=paddle.tensor( + [prompt_feat.shape[1]], dtype=paddle.int32 + ).to(self.device), + embedding=embedding.to(self.device), + flow_cache=self.flow_cache_dict[uuid], + ) + if 
self.mel_overlap_dict[uuid].shape[2] != 0: + tts_mel = fade_in_out(tts_mel, self.mel_overlap_dict[uuid], self.mel_window) + if self.hift_cache_dict[uuid] is not None: + hift_cache_mel, hift_cache_source = ( + self.hift_cache_dict[uuid]["mel"], + self.hift_cache_dict[uuid]["source"], + ) + tts_mel = paddle.cat([hift_cache_mel, tts_mel], dim=2) + else: + hift_cache_source = paddle.zeros([1, 1, 0]) + if finalize is False: + self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len :] + tts_mel = tts_mel[:, :, : -self.mel_overlap_len] + tts_speech, tts_source = self.hift.inference( + speech_feat=tts_mel, cache_source=hift_cache_source + ) + if self.hift_cache_dict[uuid] is not None: + tts_speech = fade_in_out( + tts_speech, self.hift_cache_dict[uuid]["speech"], self.speech_window + ) + self.hift_cache_dict[uuid] = { + "mel": tts_mel[:, :, -self.mel_cache_len :], + "source": tts_source[:, :, -self.source_cache_len :], + "speech": tts_speech[:, -self.source_cache_len :], + } + tts_speech = tts_speech[:, : -self.source_cache_len] + else: + if speed != 1.0: + assert ( + self.hift_cache_dict[uuid] is None + ), "speed change only support non-stream inference mode" + tts_mel = paddle.nn.functional.interpolate( + x=tts_mel, size=int(tts_mel.shape[2] / speed), mode="linear" + ) + tts_speech, tts_source = self.hift.inference( + speech_feat=tts_mel, cache_source=hift_cache_source + ) + if self.hift_cache_dict[uuid] is not None: + tts_speech = fade_in_out( + tts_speech, self.hift_cache_dict[uuid]["speech"], self.speech_window + ) + return tts_speech + + def tts( + self, + text=paddle.zeros([1, 0], dtype=paddle.int32), + flow_embedding=paddle.zeros([0, 192]), + llm_embedding=paddle.zeros([0, 192]), + prompt_text=paddle.zeros([1, 0], dtype=paddle.int32), + llm_prompt_speech_token=paddle.zeros([1, 0], dtype=paddle.int32), + flow_prompt_speech_token=paddle.zeros([1, 0], dtype=paddle.int32), + prompt_speech_feat=paddle.zeros([1, 0, 80]), + source_speech_token=paddle.zeros([1, 
0], dtype=paddle.int32), + stream=False, + speed=1.0, + **kwargs + ): + this_uuid = str(uuid.uuid1()) + with self.lock: + self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = ( + [], + False, + ) + self.hift_cache_dict[this_uuid] = None + self.mel_overlap_dict[this_uuid] = paddle.zeros([1, 80, 0]) + self.flow_cache_dict[this_uuid] = paddle.zeros([1, 80, 0, 2]) + if source_speech_token.shape[1] == 0: + p = threading.Thread( + target=self.llm_job, + args=( + text, + prompt_text, + llm_prompt_speech_token, + llm_embedding, + this_uuid, + ), + ) + else: + p = threading.Thread( + target=self.vc_job, args=(source_speech_token, this_uuid) + ) + """Not Support auto convert *.start, please judge whether it is Pytorch API and convert by yourself""" + p.start() + if stream is True: + token_hop_len = self.token_min_hop_len + while True: + time.sleep(0.1) + if ( + len(self.tts_speech_token_dict[this_uuid]) + >= token_hop_len + self.token_overlap_len + ): + this_tts_speech_token = paddle.tensor( + self.tts_speech_token_dict[this_uuid][ + : token_hop_len + self.token_overlap_len + ] + ).unsqueeze(dim=0) + this_tts_speech = self.token2wav( + token=this_tts_speech_token, + prompt_token=flow_prompt_speech_token, + prompt_feat=prompt_speech_feat, + embedding=flow_embedding, + uuid=this_uuid, + finalize=False, + ) + yield {"tts_speech": this_tts_speech.cpu()} + with self.lock: + self.tts_speech_token_dict[ + this_uuid + ] = self.tts_speech_token_dict[this_uuid][token_hop_len:] + token_hop_len = min( + self.token_max_hop_len, + int(token_hop_len * self.stream_scale_factor), + ) + if ( + self.llm_end_dict[this_uuid] is True + and len(self.tts_speech_token_dict[this_uuid]) + < token_hop_len + self.token_overlap_len + ): + break + p.join() + this_tts_speech_token = paddle.tensor( + self.tts_speech_token_dict[this_uuid] + ).unsqueeze(dim=0) + this_tts_speech = self.token2wav( + token=this_tts_speech_token, + prompt_token=flow_prompt_speech_token, + 
class CosyVoice2Model(CosyVoiceModel):
    """Inference wrapper that chains an LLM, a flow model and a HiFT vocoder.

    The class keeps per-request state in dictionaries keyed by a request
    uuid (``tts_speech_token_dict``, ``llm_end_dict``, ``hift_cache_dict``).
    ``tts`` is the entry point: it starts a producer thread that fills the
    token list and converts tokens to waveform chunks with ``token2wav``.

    NOTE(review): ``llm_job`` and ``vc_job`` are used as thread targets but
    are not defined in this class; they are presumably inherited from
    ``CosyVoiceModel`` -- confirm.
    """

    def __init__(
        self,
        llm: paddle.nn.Layer,
        flow: paddle.nn.Layer,
        hift: paddle.nn.Layer,
        fp16: bool = False,
    ):
        """Store the three sub-models and initialise streaming/session state.

        Args:
            llm: Layer that produces speech tokens.
            flow: Layer whose ``inference`` turns tokens into mel features.
            hift: Layer whose ``inference`` turns mel features into speech.
            fp16: When ``True``, ``llm`` and ``flow`` are converted with
                ``.half()`` and ``token2wav`` enables ``paddle.amp.auto_cast``.

        Note that the parent ``__init__`` is not called here.
        """
        # "cuda" when at least one CUDA device is visible, otherwise "cpu".
        self.device = device2str(
            "cuda" if paddle.device.cuda.device_count() >= 1 else "cpu"
        )
        self.llm = llm
        self.flow = flow
        self.hift = hift
        self.fp16 = fp16
        if self.fp16 is True:
            self.llm.half()
            self.flow.half()
        # Number of new tokens consumed per streaming step in ``tts``.
        self.token_hop_len = 25
        # Number of trailing mel frames kept in the vocoder cache between chunks.
        self.mel_cache_len = 8
        # Number of trailing samples kept/cross-faded between chunks
        # (480 per cached mel frame, as computed here).
        self.source_cache_len = int(self.mel_cache_len * 480)
        # Window handed to ``fade_in_out`` when blending consecutive chunks.
        self.speech_window = np.hamming(2 * self.source_cache_len)
        # Dedicated stream guard on CUDA, a no-op context manager otherwise.
        self.llm_context = (
            paddle.device.stream_guard(
                paddle.device.Stream(device=device2str(self.device))
            )
            if paddle.device.cuda.device_count() >= 1
            else nullcontext()
        )
        # Guards creation/removal of the per-request dictionary entries below.
        self.lock = threading.Lock()
        # uuid -> list of speech tokens produced so far.
        self.tts_speech_token_dict = {}
        # uuid -> True once the producer thread has finished.
        self.llm_end_dict = {}
        # uuid -> None, or a dict with "mel" / "source" / "speech" tails.
        self.hift_cache_dict = {}

    def load_jit(self, flow_encoder_model):
        """Replace ``self.flow.encoder`` with a module loaded via ``torch.jit.load``.

        NOTE(review): this calls the PyTorch JIT loader although the rest of
        the class is written against Paddle -- confirm this is intended.
        """
        flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
        self.flow.encoder = flow_encoder

    def load_vllm(self, model_dir):
        """Export the LLM for vLLM and attach a vLLM engine to ``self.llm``.

        After the engine is created, ``self.llm.llm.model.model.layers`` is
        deleted from the wrapped model.
        """
        export_cosyvoice2_vllm(self.llm, model_dir, self.device)
        # Imported lazily so vllm is only required when this method is used.
        from vllm import EngineArgs, LLMEngine

        engine_args = EngineArgs(
            model=model_dir,
            skip_tokenizer_init=True,
            enable_prompt_embeds=True,
            gpu_memory_utilization=0.2,
        )
        self.llm.vllm = LLMEngine.from_engine_args(engine_args)
        self.llm.lock = threading.Lock()
        del self.llm.llm.model.model.layers

    def token2wav(
        self,
        token,
        prompt_token,
        prompt_feat,
        embedding,
        token_offset,
        uuid,
        stream=False,
        finalize=False,
        speed=1.0,
    ):
        """Convert speech tokens to a waveform chunk for request ``uuid``.

        Args:
            token: Speech tokens; ``token.shape[1]`` is used as their length.
            prompt_token: Prompt speech tokens for the flow model.
            prompt_feat: Prompt features for the flow model.
            embedding: Embedding forwarded to the flow model.
            token_offset: Number of tokens already rendered; the first
                ``token_offset * self.flow.token_mel_ratio`` mel frames of the
                flow output are dropped.
            uuid: Request key into ``self.hift_cache_dict``.
            stream: Forwarded to ``self.flow.inference`` as ``streaming``.
            finalize: ``False`` for an intermediate chunk (cache is updated
                and the trailing ``source_cache_len`` samples are withheld),
                ``True`` for the last chunk.
            speed: Only applied when ``finalize`` is true; values other than
                1.0 resample the mel along its last axis.

        Returns:
            The speech tensor returned by ``self.hift.inference``, possibly
            cross-faded with the cached tail and trimmed.

        Raises:
            AssertionError: If ``speed != 1.0`` while a vocoder cache exists
                for ``uuid``.
        """
        with paddle.amp.auto_cast(enable=self.fp16):
            tts_mel, _ = self.flow.inference(
                token=token.to(self.device),
                token_len=paddle.tensor([token.shape[1]], dtype=paddle.int32).to(
                    self.device
                ),
                prompt_token=prompt_token.to(self.device),
                prompt_token_len=paddle.tensor(
                    [prompt_token.shape[1]], dtype=paddle.int32
                ).to(self.device),
                prompt_feat=prompt_feat.to(self.device),
                prompt_feat_len=paddle.tensor(
                    [prompt_feat.shape[1]], dtype=paddle.int32
                ).to(self.device),
                embedding=embedding.to(self.device),
                streaming=stream,
                finalize=finalize,
            )
        # Keep only the mel frames that belong to tokens not yet rendered.
        tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio :]
        if self.hift_cache_dict[uuid] is not None:
            # Prepend the cached mel tail so the vocoder sees continuous input.
            hift_cache_mel, hift_cache_source = (
                self.hift_cache_dict[uuid]["mel"],
                self.hift_cache_dict[uuid]["source"],
            )
            tts_mel = paddle.cat([hift_cache_mel, tts_mel], dim=2)
        else:
            # First chunk of the request: empty source cache.
            hift_cache_source = paddle.zeros([1, 1, 0])
        if finalize is False:
            tts_speech, tts_source = self.hift.inference(
                speech_feat=tts_mel, cache_source=hift_cache_source
            )
            if self.hift_cache_dict[uuid] is not None:
                tts_speech = fade_in_out(
                    tts_speech, self.hift_cache_dict[uuid]["speech"], self.speech_window
                )
            # Remember the tails for the next chunk ...
            self.hift_cache_dict[uuid] = {
                "mel": tts_mel[:, :, -self.mel_cache_len :],
                "source": tts_source[:, :, -self.source_cache_len :],
                "speech": tts_speech[:, -self.source_cache_len :],
            }
            # ... and withhold the cached tail from this chunk's output.
            tts_speech = tts_speech[:, : -self.source_cache_len]
        else:
            if speed != 1.0:
                assert (
                    self.hift_cache_dict[uuid] is None
                ), "speed change only support non-stream inference mode"
                tts_mel = paddle.nn.functional.interpolate(
                    x=tts_mel, size=int(tts_mel.shape[2] / speed), mode="linear"
                )
            tts_speech, tts_source = self.hift.inference(
                speech_feat=tts_mel, cache_source=hift_cache_source
            )
            if self.hift_cache_dict[uuid] is not None:
                tts_speech = fade_in_out(
                    tts_speech, self.hift_cache_dict[uuid]["speech"], self.speech_window
                )
        return tts_speech

    def tts(
        self,
        text=paddle.zeros([1, 0], dtype=paddle.int32),
        flow_embedding=paddle.zeros([0, 192]),
        llm_embedding=paddle.zeros([0, 192]),
        prompt_text=paddle.zeros([1, 0], dtype=paddle.int32),
        llm_prompt_speech_token=paddle.zeros([1, 0], dtype=paddle.int32),
        flow_prompt_speech_token=paddle.zeros([1, 0], dtype=paddle.int32),
        prompt_speech_feat=paddle.zeros([1, 0, 80]),
        source_speech_token=paddle.zeros([1, 0], dtype=paddle.int32),
        stream=False,
        speed=1.0,
        **kwargs
    ):
        """Generate speech as a generator of ``{"tts_speech": tensor}`` dicts.

        A fresh uuid is allocated for the call and a producer thread is
        started: ``self.llm_job`` when ``source_speech_token`` is empty along
        axis 1, otherwise ``self.vc_job``. With ``stream=True`` chunks are
        yielded as soon as enough tokens are available, followed by one final
        chunk; with ``stream=False`` a single chunk is yielded after the
        producer thread has finished. ``speed`` is only forwarded in the
        non-streaming branch.

        The default tensors are created once, when the method is defined.
        The per-request entries are removed only after the last chunk has
        been yielded and the generator is resumed.
        """
        this_uuid = str(uuid.uuid1())
        with self.lock:
            self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = (
                [],
                False,
            )
            self.hift_cache_dict[this_uuid] = None
        if source_speech_token.shape[1] == 0:
            p = threading.Thread(
                target=self.llm_job,
                args=(
                    text,
                    prompt_text,
                    llm_prompt_speech_token,
                    llm_embedding,
                    this_uuid,
                ),
            )
        else:
            p = threading.Thread(
                target=self.vc_job, args=(source_speech_token, this_uuid)
            )
        # The string statement below has no runtime effect; it reads like a
        # leftover marker from an automatic PyTorch-to-Paddle conversion tool.
        """Not Support auto convert *.start, please judge whether it is Pytorch API and convert by yourself"""
        p.start()
        if stream is True:
            token_offset = 0
            # Extra tokens required for the first chunk so that prompt length
            # plus first hop is a multiple of ``token_hop_len``.
            prompt_token_pad = int(
                np.ceil(flow_prompt_speech_token.shape[1] / self.token_hop_len)
                * self.token_hop_len
                - flow_prompt_speech_token.shape[1]
            )
            while True:
                # Poll the token list filled by the producer thread.
                time.sleep(0.1)
                this_token_hop_len = (
                    self.token_hop_len + prompt_token_pad
                    if token_offset == 0
                    else self.token_hop_len
                )
                if (
                    len(self.tts_speech_token_dict[this_uuid]) - token_offset
                    >= this_token_hop_len + self.flow.pre_lookahead_len
                ):
                    # All tokens up to the current hop plus look-ahead are
                    # passed; ``token_offset`` tells token2wav what to skip.
                    this_tts_speech_token = paddle.tensor(
                        self.tts_speech_token_dict[this_uuid][
                            : token_offset
                            + this_token_hop_len
                            + self.flow.pre_lookahead_len
                        ]
                    ).unsqueeze(dim=0)
                    this_tts_speech = self.token2wav(
                        token=this_tts_speech_token,
                        prompt_token=flow_prompt_speech_token,
                        prompt_feat=prompt_speech_feat,
                        embedding=flow_embedding,
                        token_offset=token_offset,
                        uuid=this_uuid,
                        stream=stream,
                        finalize=False,
                    )
                    token_offset += this_token_hop_len
                    yield {"tts_speech": this_tts_speech.cpu()}
                if (
                    self.llm_end_dict[this_uuid] is True
                    and len(self.tts_speech_token_dict[this_uuid]) - token_offset
                    < this_token_hop_len + self.flow.pre_lookahead_len
                ):
                    # Producer finished and too few tokens remain for a full hop.
                    break
            p.join()
            # Render whatever is left as the final chunk (``stream`` is not
            # passed here, so token2wav uses its default of False).
            this_tts_speech_token = paddle.tensor(
                self.tts_speech_token_dict[this_uuid]
            ).unsqueeze(dim=0)
            this_tts_speech = self.token2wav(
                token=this_tts_speech_token,
                prompt_token=flow_prompt_speech_token,
                prompt_feat=prompt_speech_feat,
                embedding=flow_embedding,
                token_offset=token_offset,
                uuid=this_uuid,
                finalize=True,
            )
            yield {"tts_speech": this_tts_speech.cpu()}
        else:
            # Non-streaming: wait for every token, then render in one pass.
            p.join()
            this_tts_speech_token = paddle.tensor(
                self.tts_speech_token_dict[this_uuid]
            ).unsqueeze(dim=0)
            this_tts_speech = self.token2wav(
                token=this_tts_speech_token,
                prompt_token=flow_prompt_speech_token,
                prompt_feat=prompt_speech_feat,
                embedding=flow_embedding,
                token_offset=0,
                uuid=this_uuid,
                finalize=True,
                speed=speed,
            )
            yield {"tts_speech": this_tts_speech.cpu()}
        # Drop the per-request state created at the top of this call.
        with self.lock:
            self.tts_speech_token_dict.pop(this_uuid)
            self.llm_end_dict.pop(this_uuid)
            self.hift_cache_dict.pop(this_uuid)
        if paddle.device.cuda.device_count() >= 1:
            paddle.device.cuda.empty_cache()
            paddle.device.current_stream().synchronize()
class BasicTransformerBlock(nn.Module):
    r"""
    A basic Transformer block: pre-norm self-attention followed by a pre-norm
    feed-forward network, each wrapped in a residual connection.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function used in the feed-forward.
        cross_attention_dim (`int`, *optional*): Accepted for signature compatibility; this block
            builds no cross-attention layer, so the value is not used.
        num_embeds_ada_norm (`int`, *optional*): Required when `norm_type` is `"ada_norm"` or
            `"ada_norm_zero"`; only validated here, the block always uses plain layer norm.
        attention_bias (`bool`, *optional*, defaults to `False`): Whether the q/k/v projections have a bias.
        double_self_attention (`bool`, *optional*): Accepted for signature compatibility; not used.
        upcast_attention (`bool`, *optional*): Forwarded to the attention layer.
        norm_elementwise_affine (`bool`, *optional*, defaults to `True`): Forwarded to the layer norms.
        norm_type (`str`, *optional*, defaults to `"layer_norm"`): See `num_embeds_ada_norm`.
        final_dropout (`bool`, *optional*): Forwarded to the feed-forward.

    NOTE(review): this class subclasses `nn.Module` and calls
    `nn.LayerNorm(..., elementwise_affine=...)`, while `FeedForward` in the
    same file subclasses `nn.Layer`. Confirm which framework `nn` refers to
    and align the base class and the `LayerNorm` arguments accordingly.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout=0.0,
        activation_fn: str = "geglu",
        cross_attention_dim: Optional[int] = None,
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_type: str = "layer_norm",
        final_dropout: bool = False,
    ):
        super().__init__()
        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
        # attn1 is always built as pure self-attention (cross_attention_dim=None),
        # so the "only cross attention" mode read by forward() is disabled.
        # The original never defined this attribute, which made forward() fail.
        self.only_cross_attention = False

        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
            raise ValueError(
                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
            )

        # 1. Self-attention with its own normalisation layer.
        self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=None,  # the comma after this argument was missing
            upcast_attention=upcast_attention,
        )

        # 2. Cross-attention is not instantiated in this block.
        self.norm2 = None
        self.attn2 = None

        # 3. Feed-forward with its own normalisation layer.
        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        timestep=None,
        cross_attention_kwargs=None,
    ):
        """Apply self-attention and feed-forward, each with a residual add.

        Args:
            hidden_states: Input tensor; the result has the same shape.
            attention_mask: Optional mask forwarded to the self-attention.
            encoder_hidden_states: Only forwarded when
                ``self.only_cross_attention`` is true (never, as built here).
            encoder_attention_mask: Used instead of ``attention_mask`` under
                the same condition.
            timestep: Accepted because callers pass it; unused, since the
                block applies plain layer norm rather than adaptive norm.
            cross_attention_kwargs: Optional extra keyword arguments for the
                attention layer.

        Returns:
            The transformed hidden states.
        """
        if cross_attention_kwargs is None:
            cross_attention_kwargs = {}

        norm_hidden_states = self.norm1(hidden_states)
        attn_output = self.attn1(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
            attention_mask=encoder_attention_mask if self.only_cross_attention else attention_mask,
            **cross_attention_kwargs,
        )
        hidden_states = attn_output + hidden_states

        norm_hidden_states = self.norm3(hidden_states)
        ff_output = self.ff(norm_hidden_states)
        hidden_states = ff_output + hidden_states
        return hidden_states
class Attention(nn.Module):
    """Multi-head scaled dot-product attention with linear q/k/v/out projections.

    Acts as self-attention when ``encoder_hidden_states`` is not supplied and
    as cross-attention otherwise. Several constructor arguments
    (``upcast_softmax``, ``cross_attention_norm``,
    ``cross_attention_norm_num_groups``, ``qk_norm``, ``norm_num_groups``,
    ``spatial_norm_dim``, ``only_cross_attention``, ``eps``, ``processor``)
    are accepted for signature compatibility and are not used by this
    implementation.

    NOTE(review): this class is written against the PyTorch API
    (``nn.Module``, ``nn.Linear(bias=...)``, ``nn.ModuleList``,
    ``Tensor.view``) while ``FeedForward`` in the same file uses
    ``nn.Layer``. Confirm which framework is intended for this file.
    """

    def __init__(
        self,
        query_dim: int,
        cross_attention_dim: Optional[int] = None,
        heads: int = 8,
        dim_head: int = 64,
        dropout: float = 0.0,
        bias: bool = False,
        upcast_attention: bool = False,
        upcast_softmax: bool = False,
        cross_attention_norm: Optional[str] = None,
        cross_attention_norm_num_groups: int = 32,
        qk_norm: Optional[str] = None,
        norm_num_groups: Optional[int] = None,
        spatial_norm_dim: Optional[int] = None,
        out_bias: bool = True,
        scale_qk: bool = True,
        only_cross_attention: bool = False,
        eps: float = 1e-5,
        rescale_output_factor: float = 1.0,
        processor: Optional["AttnProcessor"] = None,
        out_dim: int = None,
    ):
        super().__init__()
        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
        self.query_dim = query_dim
        self.use_bias = bias
        self.is_cross_attention = cross_attention_dim is not None
        # Keys/values are projected from query_dim when no cross dim is given.
        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
        self.upcast_attention = upcast_attention
        self.upcast_softmax = upcast_softmax
        self.rescale_output_factor = rescale_output_factor
        self.dropout = dropout
        self.fused_projections = False
        self.out_dim = out_dim if out_dim is not None else query_dim

        self.scale_qk = scale_qk
        # Stored for reference; forward() relies on the default scaling of
        # F.scaled_dot_product_attention and does not read this value.
        self.scale = dim_head**-0.5 if self.scale_qk else 1.0

        self.heads = out_dim // dim_head if out_dim is not None else heads
        self.sliceable_head_dim = heads

        self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
        self.to_k = nn.Linear(self.cross_attention_dim, self.inner_dim, bias=bias)
        self.to_v = nn.Linear(self.cross_attention_dim, self.inner_dim, bias=bias)
        # to_out[0] is the output projection, to_out[1] the output dropout.
        self.to_out = nn.ModuleList([])
        self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
        self.to_out.append(nn.Dropout(dropout))

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        temb: Optional[torch.Tensor] = None,
    ):
        """Run attention over ``hidden_states``.

        Args:
            hidden_states: Query input, either 3-D ``(batch, seq, query_dim)``
                or 4-D ``(batch, channel, height, width)``; a 4-D input is
                flattened over its last two axes and restored on return.
            encoder_hidden_states: Key/value input. When ``None`` the (possibly
                flattened) ``hidden_states`` are used, i.e. self-attention.
            attention_mask: Optional mask passed to
                ``F.scaled_dot_product_attention``. A 3-D mask is given a
                singleton head axis so it applies to every head.
            temb: Unused; accepted for signature compatibility.

        Returns:
            Tensor with the same layout as ``hidden_states`` and ``out_dim``
            channels, divided by ``rescale_output_factor``.
        """
        input_ndim = hidden_states.ndim
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        # Self-attention fallback: the original passed None into to_k/to_v.
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        batch_size = encoder_hidden_states.shape[0]

        # The original referenced an undefined name `attn` (left over from a
        # processor-style function); every such use is `self` here.
        query = self.to_q(hidden_states)
        key = self.to_k(encoder_hidden_states)
        value = self.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // self.heads

        # (batch, seq, inner) -> (batch, heads, seq, head_dim)
        query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)

        if attention_mask is not None and attention_mask.ndim == 3:
            # (batch, q_len, k_len) -> (batch, 1, q_len, k_len); without the
            # head axis the batch axis would be broadcast against the heads.
            attention_mask = attention_mask.unsqueeze(1)

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = self.to_out[0](hidden_states)
        # dropout
        hidden_states = self.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
        hidden_states = hidden_states / self.rescale_output_factor
        return hidden_states
def get_activation(act_fn):
    """Build a fresh activation layer for the given name.

    Args:
        act_fn: One of ``"silu"``, ``"mish"``, ``"relu"`` or ``"gelu"``.

    Returns:
        A new instance of the matching ``nn`` activation layer.

    Raises:
        ValueError: If ``act_fn`` is none of the supported names.
    """
    # Supported name -> attribute of ``nn`` holding the layer class. The
    # class is looked up only on a match, and names are compared with ``==``
    # in the listed order.
    layer_attr_by_name = (
        ("silu", "Silu"),
        ("mish", "Mish"),
        ("relu", "ReLU"),
        ("gelu", "GELU"),
    )
    for name, layer_attr in layer_attr_by_name:
        if act_fn == name:
            return getattr(nn, layer_attr)()
    raise ValueError(f"Unsupported activation function: {act_fn}")
class TimestepEmbedding(nn.Layer):
    """Two-layer MLP that maps a timestep feature vector to an embedding.

    The computation is ``linear_2(act(linear_1(sample)))`` followed by an
    optional post-activation. When ``cond_proj_dim`` is given, a bias-free
    projection of the ``condition`` argument is added to ``sample`` first.

    Args:
        in_channels: Size of the last axis of ``sample``.
        time_embed_dim: Hidden size of the MLP.
        act_fn: Name passed to ``get_activation`` for the inner activation.
        out_dim: Output size; defaults to ``time_embed_dim``.
        post_act_fn: Optional activation name applied to the output.
        cond_proj_dim: Size of the last axis of ``condition``; when ``None``
            no conditioning projection is created.
    """

    def __init__(
        self,
        in_channels: int,
        time_embed_dim: int,
        act_fn: str = "silu",
        out_dim: int = None,
        # Quoted on purpose: the file imports only ``Tuple`` from ``typing``,
        # so an unquoted ``Optional`` raises NameError when the class is defined.
        post_act_fn: "Optional[str]" = None,
        cond_proj_dim=None,
    ):
        super().__init__()

        self.linear_1 = nn.Linear(in_channels, time_embed_dim)

        if cond_proj_dim is not None:
            # Paddle's Linear disables the bias via ``bias_attr=False``;
            # it has no ``bias`` keyword.
            self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias_attr=False)
        else:
            self.cond_proj = None

        self.act = get_activation(act_fn)

        if out_dim is not None:
            time_embed_dim_out = out_dim
        else:
            time_embed_dim_out = time_embed_dim
        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)

        if post_act_fn is None:
            self.post_act = None
        else:
            self.post_act = get_activation(post_act_fn)

    def forward(self, sample, condition=None):
        """Embed ``sample``; ``condition`` is used only if a projection exists.

        Args:
            sample: Tensor whose last axis has ``in_channels`` entries.
            condition: Optional tensor added to ``sample`` after projection;
                ignored when the layer was built without ``cond_proj_dim``.

        Returns:
            The embedded tensor.
        """
        if condition is not None and self.cond_proj is not None:
            sample = sample + self.cond_proj(condition)
        sample = self.linear_1(sample)

        if self.act is not None:
            sample = self.act(sample)

        sample = self.linear_2(sample)

        if self.post_act is not None:
            sample = self.post_act(sample)
        return sample
class Transpose(nn.Layer):
    """Layer that swaps two axes of its input.

    Exists so that an axis swap can be placed inside ``nn.Sequential``
    (see ``CausalBlock1D``), which is why it derives from ``nn.Layer`` like
    the other layers in this file.

    Args:
        dim0: First axis to swap (negative values count from the end).
        dim1: Second axis to swap (negative values count from the end).
    """

    def __init__(self, dim0: int, dim1: int):
        super().__init__()
        self.dim0 = dim0
        self.dim1 = dim1

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        """Return ``x`` with axes ``dim0`` and ``dim1`` exchanged."""
        # paddle.transpose expects a full permutation of all axes, not just
        # the pair to swap, so start from the identity and exchange two entries.
        perm = list(range(x.ndim))
        perm[self.dim0], perm[self.dim1] = perm[self.dim1], perm[self.dim0]
        return paddle.transpose(x, perm=perm)
def subsequent_chunk_mask(
        size: int,
        chunk_size: int,
        num_left_chunks: int = -1,
) -> paddle.Tensor:
    """Build a (size, size) chunk-wise attention mask for streaming encoders.

    Row ``i`` is true for every column that lies before the end of the chunk
    containing position ``i``; each position can therefore see its own chunk
    and everything to its left.

    Args:
        size (int): Side length of the square mask.
        chunk_size (int): Number of positions per chunk.
        num_left_chunks (int): Accepted for interface compatibility; this
            implementation does not read it, so the left context is never
            limited.

    Returns:
        paddle.Tensor: Boolean mask of shape (size, size).

    Examples:
        >>> subsequent_chunk_mask(4, 2)
        [[1, 1, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 1, 1],
         [1, 1, 1, 1]]
    """
    positions = paddle.arange(size, dtype='int64')
    # Index of the chunk each position falls into.
    chunk_index = paddle.floor_divide(positions, chunk_size)
    # Exclusive end position of that chunk.
    chunk_end = (chunk_index + 1) * chunk_size
    columns = positions.unsqueeze(0)    # (1, size)
    row_limits = chunk_end.unsqueeze(1)  # (size, 1)
    return columns < row_limits
+ + Args: + xs (paddle.Tensor): padded input, (B, L, D), L for max length + mask (paddle.Tensor): mask for xs, (B, 1, L) + use_dynamic_chunk (bool): whether to use dynamic chunk or not + use_dynamic_left_chunk (bool): whether to use dynamic left chunk for + training. + decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + static_chunk_size (int): chunk size for static chunk training/decoding + if it's greater than 0, if use_dynamic_chunk is true, + this parameter will be ignored + num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. + >=0: use num_decoding_left_chunks + <0: use all left chunks + enable_full_context (bool): + True: chunk size is either [1, 25] or full context(max_len) + False: chunk size ~ U[1, 25] + + Returns: + paddle.Tensor: chunk mask of the input xs. + """ + # Whether to use chunk mask or not + if use_dynamic_chunk: + max_len = xs.shape[1] + if decoding_chunk_size < 0: + chunk_size = max_len + num_left_chunks = -1 + elif decoding_chunk_size > 0: + chunk_size = decoding_chunk_size + num_left_chunks = num_decoding_left_chunks + else: + # chunk size is either [1, 25] or full context(max_len). + # Since we use 4 times subsampling and allow up to 1s(100 frames) + # delay, the maximum frame is 100 / 4 = 25. 
def mask_to_bias(mask: paddle.Tensor, dtype: str) -> paddle.Tensor:
    """Turn a boolean keep-mask into an additive attention bias.

    Positions where ``mask`` is true become ``0`` and positions where it is
    false become ``-1e10``, in the requested floating-point dtype.

    Args:
        mask: Boolean tensor.
        dtype: Target dtype, one of float32 / bfloat16 / float16. Both a
            name such as ``"float32"`` and a dtype object such as the
            ``x.dtype`` passed by callers in this file are accepted.

    Returns:
        Tensor of the same shape as ``mask`` with dtype ``dtype``.

    Raises:
        AssertionError: If ``mask`` is not boolean or ``dtype`` is unsupported.
    """

    def _dtype_name(value) -> str:
        # Compare by name so validation does not depend on whether the
        # framework's dtype objects compare equal to plain strings.
        # NOTE(review): relies on dtype objects printing as "<prefix>.<name>"
        # (e.g. "paddle.float32") -- confirm for the targeted Paddle version.
        if isinstance(value, str):
            return value
        return str(value).rsplit(".", 1)[-1]

    assert _dtype_name(mask.dtype) == 'bool', "Input mask must be of boolean type"
    assert _dtype_name(dtype) in ('float32', 'bfloat16', 'float16'), f"Unsupported dtype: {dtype}"
    mask = mask.astype(dtype)
    # True -> 1.0 -> bias 0; False -> 0.0 -> bias -1e10.
    mask = (1.0 - mask) * -1.0e+10

    return mask
+ """ + super().__init__() + channels = tuple(channels) + self.in_channels = in_channels + self.out_channels = out_channels + + self.time_embeddings = SinusoidalPosEmb(in_channels) + time_embed_dim = channels[0] * 4 + self.time_mlp = TimestepEmbedding( + in_channels=in_channels, + time_embed_dim=time_embed_dim, + act_fn="silu", + ) + self.down_blocks = nn.LayerList([]) + self.mid_blocks = nn.LayerList([]) + self.up_blocks = nn.LayerList([]) + + output_channel = in_channels + for i in range(len(channels)): + input_channel = output_channel + output_channel = channels[i] + is_last = i == len(channels) - 1 + resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + dim=output_channel, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + for _ in range(n_blocks) + ] + ) + downsample = ( + Downsample1D(output_channel) if not is_last else nn.Conv1D(output_channel, output_channel, 3, padding=1) + ) + self.down_blocks.append(nn.LayerList([resnet, transformer_blocks, downsample])) + + for _ in range(num_mid_blocks): + input_channel = channels[-1] + out_channels = channels[-1] + resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + + transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + dim=output_channel, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + for _ in range(n_blocks) + ] + ) + + self.mid_blocks.append(nn.LayerList([resnet, transformer_blocks])) + + channels = channels[::-1] + (channels[0],) + for i in range(len(channels) - 1): + input_channel = channels[i] * 2 + output_channel = channels[i + 1] + is_last = i == len(channels) - 2 + resnet = ResnetBlock1D( + dim=input_channel, + dim_out=output_channel, + time_emb_dim=time_embed_dim, + ) + transformer_blocks = 
nn.LayerList( + [ + BasicTransformerBlock( + dim=output_channel, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + for _ in range(n_blocks) + ] + ) + upsample = ( + Upsample1D(output_channel, use_conv_transpose=True) + if not is_last + else nn.Conv1D(output_channel, output_channel, 3, padding=1) + ) + self.up_blocks.append(nn.LayerList([resnet, transformer_blocks, upsample])) + self.final_block = Block1D(channels[-1], channels[-1]) + self.final_proj = nn.Conv1D(channels[-1], self.out_channels, 1) + self.initialize_weights() + + def initialize_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv1D): + nn.initializer.KaimingNormal(m.weight, nonlinearity='relu') + if m.bias is not None: + nn.initializer.Constant(m.bias, value=0) + elif isinstance(m, nn.GroupNorm): + nn.initializer.Constant(m.weight, value=1) + nn.initializer.Constant(m.bias, value=0) + elif isinstance(m, nn.Linear): + nn.initializer.KaimingNormal(m.weight, nonlinearity='relu') + if m.bias is not None: + nn.initializer.Constant(m.bias, value=0) + + def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False): + """Forward pass of the UNet1DConditional model. + + Args: + x (paddle.Tensor): shape (batch_size, in_channels, time) + mask (paddle.Tensor): shape (batch_size, 1, time) + t (paddle.Tensor): shape (batch_size) + spks (paddle.Tensor, optional): shape: (batch_size, condition_channels). Defaults to None. + cond (paddle.Tensor, optional): placeholder for future use. Defaults to None. 
+ + Returns: + paddle.Tensor: output tensor + """ + + t = self.time_embeddings(t).astype(t.dtype) + t = self.time_mlp(t) + + x = pack([x, mu], "b * t")[0] + + if spks is not None: + spks = repeat(spks, "b c -> b c t", t=x.shape[-1]) + x = pack([x, spks], "b * t")[0] + if cond is not None: + x = pack([x, cond], "b * t")[0] + + hiddens = [] + masks = [mask] + for resnet, transformer_blocks, downsample in self.down_blocks: + mask_down = masks[-1] + x = resnet(x, mask_down, t) + x = rearrange(x, "b c t -> b t c").contiguous() + attn_mask = add_optional_chunk_mask(x, mask_down.astype('bool'), False, False, 0, 0, -1).repeat(1, x.shape[1], 1) + attn_mask = mask_to_bias(attn_mask, x.dtype) + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=attn_mask, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t").contiguous() + hiddens.append(x) # Save hidden states for skip connections + x = downsample(x * mask_down) + masks.append(mask_down[:, :, ::2]) + masks = masks[:-1] + mask_mid = masks[-1] + + for resnet, transformer_blocks in self.mid_blocks: + x = resnet(x, mask_mid, t) + x = rearrange(x, "b c t -> b t c").contiguous() + attn_mask = add_optional_chunk_mask(x, mask_mid.astype('bool'), False, False, 0, 0, -1).repeat(1, x.shape[1], 1) + attn_mask = mask_to_bias(attn_mask, x.dtype) + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=attn_mask, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t").contiguous() + + for resnet, transformer_blocks, upsample in self.up_blocks: + mask_up = masks.pop() + skip = hiddens.pop() + x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0] + x = resnet(x, mask_up, t) + x = rearrange(x, "b c t -> b t c").contiguous() + attn_mask = add_optional_chunk_mask(x, mask_up.astype('bool'), False, False, 0, 0, -1).repeat(1, x.shape[1], 1) + attn_mask = mask_to_bias(attn_mask, x.dtype) + for transformer_block in transformer_blocks: + x = 
transformer_block( + hidden_states=x, + attention_mask=attn_mask, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t").contiguous() + x = upsample(x * mask_up) + x = self.final_block(x, mask_up) + output = self.final_proj(x * mask_up) + return output * mask + + +class CausalConditionalDecoder(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + channels=(256, 256), + dropout=0.05, + attention_head_dim=64, + n_blocks=1, + num_mid_blocks=2, + num_heads=4, + act_fn="snake", + static_chunk_size=50, + num_decoding_left_chunks=2, + ): + """ + This decoder requires an input with the same shape of the target. So, if your text content + is shorter or longer than the outputs, please re-sampling it before feeding to the decoder. + """ + super().__init__() + channels = tuple(channels) + self.in_channels = in_channels + self.out_channels = out_channels + self.time_embeddings = SinusoidalPosEmb(in_channels) + time_embed_dim = channels[0] * 4 + self.time_mlp = TimestepEmbedding( + in_channels=in_channels, + time_embed_dim=time_embed_dim, + act_fn="silu", + ) + self.static_chunk_size = static_chunk_size + self.num_decoding_left_chunks = num_decoding_left_chunks + self.down_blocks = nn.LayerList([]) + self.mid_blocks = nn.LayerList([]) + self.up_blocks = nn.LayerList([]) + + output_channel = in_channels + for i in range(len(channels)): + input_channel = output_channel + output_channel = channels[i] + is_last = i == len(channels) - 1 + resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + dim=output_channel, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + for _ in range(n_blocks) + ] + ) + downsample = ( + Downsample1D(output_channel) if not is_last else CausalConv1d(output_channel, output_channel, 3) # 假设已实现 + ) + self.down_blocks.append(nn.LayerList([resnet, transformer_blocks, 
downsample])) + + for _ in range(num_mid_blocks): + input_channel = channels[-1] + out_channels = channels[-1] + resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + + transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + dim=output_channel, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + for _ in range(n_blocks) + ] + ) + + self.mid_blocks.append(nn.LayerList([resnet, transformer_blocks])) + + channels = channels[::-1] + (channels[0],) + for i in range(len(channels) - 1): + input_channel = channels[i] * 2 + output_channel = channels[i + 1] + is_last = i == len(channels) - 2 + resnet = CausalResnetBlock1D( + dim=input_channel, + dim_out=output_channel, + time_emb_dim=time_embed_dim, + ) + transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + dim=output_channel, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + for _ in range(n_blocks) + ] + ) + upsample = ( + Upsample1D(output_channel, use_conv_transpose=True) # 假设已实现 + if not is_last + else CausalConv1d(output_channel, output_channel, 3) + ) + self.up_blocks.append(nn.LayerList([resnet, transformer_blocks, upsample])) + self.final_block = CausalBlock1D(channels[-1], channels[-1]) # 假设已实现 + self.final_proj = nn.Conv1D(channels[-1], self.out_channels, 1) # 使用 Conv1D + self.initialize_weights() + + def initialize_weights(self): + for m in self.sublayers(): # 使用 sublayers() 而不是 modules() + if isinstance(m, nn.Conv1D): + nn.initializer.KaimingNormal(m.weight, nonlinearity='relu') + if m.bias is not None: + nn.initializer.Constant(m.bias, value=0) + elif isinstance(m, nn.GroupNorm): + nn.initializer.Constant(m.weight, value=1) + nn.initializer.Constant(m.bias, value=0) + elif isinstance(m, nn.Linear): + nn.initializer.KaimingNormal(m.weight, nonlinearity='relu') + if m.bias is not None: + 
nn.initializer.Constant(m.bias, value=0) + + def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False): + """Forward pass of the UNet1DConditional model. + + Args: + x (paddle.Tensor): shape (batch_size, in_channels, time) + mask (paddle.Tensor): shape (batch_size, 1, time) + mu (paddle.Tensor): mean tensor for conditioning + t (paddle.Tensor): shape (batch_size) + spks (paddle.Tensor, optional): shape: (batch_size, condition_channels). Defaults to None. + cond (paddle.Tensor, optional): placeholder for future use. Defaults to None. + streaming (bool, optional): whether to use streaming mode. Defaults to False. + + Returns: + paddle.Tensor: output tensor + """ + t = self.time_embeddings(t).astype(t.dtype) # 使用 astype 代替 .to(t.dtype) + t = self.time_mlp(t) + + x = pack([x, mu], "b * t")[0] # 假设 pack 函数已实现 + + if spks is not None: + spks = repeat(spks, "b c -> b c t", t=x.shape[-1]) # 假设 repeat 函数已实现 + x = pack([x, spks], "b * t")[0] + if cond is not None: + x = pack([x, cond], "b * t")[0] + + hiddens = [] + masks = [mask] + for resnet, transformer_blocks, downsample in self.down_blocks: + mask_down = masks[-1] + x = resnet(x, mask_down, t) + x = rearrange(x, "b c t -> b t c").contiguous() # 假设 rearrange 函数已实现 + if streaming is True: + attn_mask = add_optional_chunk_mask(x, mask_down.astype('bool'), False, False, 0, self.static_chunk_size, -1) # 使用 astype('bool') + else: + attn_mask = add_optional_chunk_mask(x, mask_down.astype('bool'), False, False, 0, 0, -1).repeat(1, x.shape[1], 1) # 使用 .shape 而不是 .size() + attn_mask = mask_to_bias(attn_mask, x.dtype) # 假设 mask_to_bias 函数已实现 + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=attn_mask, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t").contiguous() + hiddens.append(x) # Save hidden states for skip connections + x = downsample(x * mask_down) + masks.append(mask_down[:, :, ::2]) + masks = masks[:-1] + mask_mid = masks[-1] + + for resnet, 
transformer_blocks in self.mid_blocks: + x = resnet(x, mask_mid, t) + x = rearrange(x, "b c t -> b t c").contiguous() + if streaming is True: + attn_mask = add_optional_chunk_mask(x, mask_mid.astype('bool'), False, False, 0, self.static_chunk_size, -1) + else: + attn_mask = add_optional_chunk_mask(x, mask_mid.astype('bool'), False, False, 0, 0, -1).repeat(1, x.shape[1], 1) + attn_mask = mask_to_bias(attn_mask, x.dtype) + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=attn_mask, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t").contiguous() + + for resnet, transformer_blocks, upsample in self.up_blocks: + mask_up = masks.pop() + skip = hiddens.pop() + x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0] + x = resnet(x, mask_up, t) + x = rearrange(x, "b c t -> b t c").contiguous() + if streaming is True: + attn_mask = add_optional_chunk_mask(x, mask_up.astype('bool'), False, False, 0, self.static_chunk_size, -1) + else: + attn_mask = add_optional_chunk_mask(x, mask_up.astype('bool'), False, False, 0, 0, -1).repeat(1, x.shape[1], 1) + attn_mask = mask_to_bias(attn_mask, x.dtype) + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=attn_mask, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t").contiguous() + x = upsample(x * mask_up) + x = self.final_block(x, mask_up) + output = self.final_proj(x * mask_up) + return output * mask diff --git a/paddlespeech/t2s/modules/flow/flow.py b/paddlespeech/t2s/modules/flow/flow.py new file mode 100644 index 0000000000..c22b7a2622 --- /dev/null +++ b/paddlespeech/t2s/modules/flow/flow.py @@ -0,0 +1,320 @@ +import logging +import random +from typing import Dict, Optional + +import paddle +from omegaconf import DictConfig + +from cosyvoice.utils.mask import make_pad_mask + + +class MaskedDiffWithXvec(paddle.nn.Layer): + def __init__( + self, + input_size: int = 512, + output_size: int = 80, + spk_embed_dim: int 
= 192, + output_type: str = "mel", + vocab_size: int = 4096, + input_frame_rate: int = 50, + only_mask_loss: bool = True, + encoder: paddle.nn.Layer = None, + length_regulator: paddle.nn.Layer = None, + decoder: paddle.nn.Layer = None, + decoder_conf: Dict = { + "in_channels": 240, + "out_channel": 80, + "spk_emb_dim": 80, + "n_spks": 1, + "cfm_params": DictConfig( + { + "sigma_min": 1e-06, + "solver": "euler", + "t_scheduler": "cosine", + "training_cfg_rate": 0.2, + "inference_cfg_rate": 0.7, + "reg_loss_type": "l1", + } + ), + "decoder_params": { + "channels": [256, 256], + "dropout": 0.0, + "attention_head_dim": 64, + "n_blocks": 4, + "num_mid_blocks": 12, + "num_heads": 8, + "act_fn": "gelu", + }, + }, + mel_feat_conf: Dict = { + "n_fft": 1024, + "num_mels": 80, + "sampling_rate": 22050, + "hop_size": 256, + "win_size": 1024, + "fmin": 0, + "fmax": 8000, + }, + ): + super().__init__() + self.input_size = input_size + self.output_size = output_size + self.decoder_conf = decoder_conf + self.mel_feat_conf = mel_feat_conf + self.vocab_size = vocab_size + self.output_type = output_type + self.input_frame_rate = input_frame_rate + logging.info(f"input frame rate={self.input_frame_rate}") + self.input_embedding = paddle.nn.Embedding(vocab_size, input_size) + self.spk_embed_affine_layer = paddle.nn.Linear( + in_features=spk_embed_dim, out_features=output_size + ) + self.encoder = encoder + self.encoder_proj = paddle.nn.Linear( + in_features=self.encoder.output_size(), out_features=output_size + ) + self.decoder = decoder + self.length_regulator = length_regulator + self.only_mask_loss = only_mask_loss + + def forward( +>>>>>> self, batch: dict, device: torch.device + ) -> Dict[str, Optional[paddle.Tensor]]: + token = batch["speech_token"].to(device) + token_len = batch["speech_token_len"].to(device) + feat = batch["speech_feat"].to(device) + feat_len = batch["speech_feat_len"].to(device) + embedding = batch["embedding"].to(device) + embedding = 
paddle.nn.functional.normalize(x=embedding, axis=1) + embedding = self.spk_embed_affine_layer(embedding) + mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device) + token = self.input_embedding(paddle.clamp(token, min=0)) * mask + h, h_lengths = self.encoder(token, token_len) + h = self.encoder_proj(h) + h, h_lengths = self.length_regulator(h, feat_len) + conds = paddle.zeros(feat.shape, device=token.place) + for i, j in enumerate(feat_len): + if random.random() < 0.5: + continue + index = random.randint(0, int(0.3 * j)) + conds[i, :index] = feat[i, :index] + conds = conds.transpose(1, 2) + mask = (~make_pad_mask(feat_len)).to(h) + loss, _ = self.decoder.compute_loss( + feat.transpose(1, 2).contiguous(), + mask.unsqueeze(1), + h.transpose(1, 2).contiguous(), + embedding, + cond=conds, + ) + return {"loss": loss} + + @paddle.no_grad() + def inference( + self, + token, + token_len, + prompt_token, + prompt_token_len, + prompt_feat, + prompt_feat_len, + embedding, + flow_cache, + ): + assert token.shape[0] == 1 + embedding = paddle.nn.functional.normalize(x=embedding, axis=1) + embedding = self.spk_embed_affine_layer(embedding) + token_len1, token_len2 = prompt_token.shape[1], token.shape[1] + token, token_len = ( + paddle.cat([prompt_token, token], dim=1), + prompt_token_len + token_len, + ) + mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding) + token = self.input_embedding(paddle.clamp(token, min=0)) * mask + h, h_lengths = self.encoder(token, token_len) + h = self.encoder_proj(h) + mel_len1, mel_len2 = prompt_feat.shape[1], int( + token_len2 / self.input_frame_rate * 22050 / 256 + ) + h, h_lengths = self.length_regulator.inference( + h[:, :token_len1], + h[:, token_len1:], + mel_len1, + mel_len2, + self.input_frame_rate, + ) + conds = paddle.zeros( + [1, mel_len1 + mel_len2, self.output_size], device=token.place + ).to(h.dtype) + conds[:, :mel_len1] = prompt_feat + conds = conds.transpose(1, 2) + mask = 
(~make_pad_mask(paddle.tensor([mel_len1 + mel_len2]))).to(h) + feat, flow_cache = self.decoder( + mu=h.transpose(1, 2).contiguous(), + mask=mask.unsqueeze(1), + spks=embedding, + cond=conds, + n_timesteps=10, + prompt_len=mel_len1, + cache=flow_cache, + ) + feat = feat[:, :, mel_len1:] + assert feat.shape[2] == mel_len2 + return feat.float(), flow_cache + + +class CausalMaskedDiffWithXvec(paddle.nn.Layer): + def __init__( + self, + input_size: int = 512, + output_size: int = 80, + spk_embed_dim: int = 192, + output_type: str = "mel", + vocab_size: int = 6561, + input_frame_rate: int = 25, + only_mask_loss: bool = True, + token_mel_ratio: int = 2, + pre_lookahead_len: int = 3, + encoder: paddle.nn.Layer = None, + decoder: paddle.nn.Layer = None, + decoder_conf: Dict = { + "in_channels": 240, + "out_channel": 80, + "spk_emb_dim": 80, + "n_spks": 1, + "cfm_params": DictConfig( + { + "sigma_min": 1e-06, + "solver": "euler", + "t_scheduler": "cosine", + "training_cfg_rate": 0.2, + "inference_cfg_rate": 0.7, + "reg_loss_type": "l1", + } + ), + "decoder_params": { + "channels": [256, 256], + "dropout": 0.0, + "attention_head_dim": 64, + "n_blocks": 4, + "num_mid_blocks": 12, + "num_heads": 8, + "act_fn": "gelu", + }, + }, + mel_feat_conf: Dict = { + "n_fft": 1024, + "num_mels": 80, + "sampling_rate": 22050, + "hop_size": 256, + "win_size": 1024, + "fmin": 0, + "fmax": 8000, + }, + ): + super().__init__() + self.input_size = input_size + self.output_size = output_size + self.decoder_conf = decoder_conf + self.mel_feat_conf = mel_feat_conf + self.vocab_size = vocab_size + self.output_type = output_type + self.input_frame_rate = input_frame_rate + logging.info(f"input frame rate={self.input_frame_rate}") + self.input_embedding = paddle.nn.Embedding(vocab_size, input_size) + self.spk_embed_affine_layer = paddle.nn.Linear( + in_features=spk_embed_dim, out_features=output_size + ) + self.encoder = encoder + self.encoder_proj = paddle.nn.Linear( + 
in_features=self.encoder.output_size(), out_features=output_size + ) + self.decoder = decoder + self.only_mask_loss = only_mask_loss + self.token_mel_ratio = token_mel_ratio + self.pre_lookahead_len = pre_lookahead_len + + def forward( +>>>>>> self, batch: dict, device: torch.device + ) -> Dict[str, Optional[paddle.Tensor]]: + token = batch["speech_token"].to(device) + token_len = batch["speech_token_len"].to(device) + feat = batch["speech_feat"].to(device) + feat_len = batch["speech_feat_len"].to(device) + embedding = batch["embedding"].to(device) + streaming = True if random.random() < 0.5 else False + embedding = paddle.nn.functional.normalize(x=embedding, axis=1) + embedding = self.spk_embed_affine_layer(embedding) + mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device) + token = self.input_embedding(paddle.clamp(token, min=0)) * mask + h, h_lengths = self.encoder(token, token_len, streaming=streaming) + h = self.encoder_proj(h) + conds = paddle.zeros(feat.shape, device=token.place) + for i, j in enumerate(feat_len): + if random.random() < 0.5: + continue + index = random.randint(0, int(0.3 * j)) + conds[i, :index] = feat[i, :index] + conds = conds.transpose(1, 2) + mask = (~make_pad_mask(h_lengths.sum(dim=-1).squeeze(dim=1))).to(h) + loss, _ = self.decoder.compute_loss( + feat.transpose(1, 2).contiguous(), + mask.unsqueeze(1), + h.transpose(1, 2).contiguous(), + embedding, + cond=conds, + streaming=streaming, + ) + return {"loss": loss} + + @paddle.no_grad() + def inference( + self, + token, + token_len, + prompt_token, + prompt_token_len, + prompt_feat, + prompt_feat_len, + embedding, + streaming, + finalize, + ): + assert token.shape[0] == 1 + embedding = paddle.nn.functional.normalize(x=embedding, axis=1) + embedding = self.spk_embed_affine_layer(embedding) + token, token_len = ( + paddle.cat([prompt_token, token], dim=1), + prompt_token_len + token_len, + ) + mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding) + token = 
self.input_embedding(paddle.clamp(token, min=0)) * mask + if finalize is True: + h, h_lengths = self.encoder(token, token_len, streaming=streaming) + else: + token, context = ( + token[:, : -self.pre_lookahead_len], + token[:, -self.pre_lookahead_len :], + ) + h, h_lengths = self.encoder( + token, token_len, context=context, streaming=streaming + ) + mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1] + h = self.encoder_proj(h) + conds = paddle.zeros( + [1, mel_len1 + mel_len2, self.output_size], device=token.place + ).to(h.dtype) + conds[:, :mel_len1] = prompt_feat + conds = conds.transpose(1, 2) + mask = (~make_pad_mask(paddle.tensor([mel_len1 + mel_len2]))).to(h) + feat, _ = self.decoder( + mu=h.transpose(1, 2).contiguous(), + mask=mask.unsqueeze(1), + spks=embedding, + cond=conds, + n_timesteps=10, + streaming=streaming, + ) + feat = feat[:, :, mel_len1:] + assert feat.shape[2] == mel_len2 + return feat.float(), None diff --git a/paddlespeech/t2s/modules/flow/flow_matching.py b/paddlespeech/t2s/modules/flow/flow_matching.py new file mode 100644 index 0000000000..bf8c9f7a0f --- /dev/null +++ b/paddlespeech/t2s/modules/flow/flow_matching.py @@ -0,0 +1,250 @@ +import paddle +from matcha.models.components.flow_matching import BASECFM + +from cosyvoice.utils.common import set_all_random_seed + + +class ConditionalCFM(BASECFM): + def __init__( + self, + in_channels, + cfm_params, + n_spks=1, + spk_emb_dim=64, + estimator: paddle.nn.Layer = None, + ): + super().__init__( + n_feats=in_channels, + cfm_params=cfm_params, + n_spks=n_spks, + spk_emb_dim=spk_emb_dim, + ) + self.t_scheduler = cfm_params.t_scheduler + self.training_cfg_rate = cfm_params.training_cfg_rate + self.inference_cfg_rate = cfm_params.inference_cfg_rate + in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0) + self.estimator = estimator + + @paddle.no_grad() + def forward( + self, + mu, + mask, + n_timesteps, + temperature=1.0, + spks=None, + cond=None, + 
prompt_len=0, + cache=paddle.zeros(1, 80, 0, 2), + ): + """Forward diffusion + + Args: + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + n_timesteps (int): number of diffusion steps + temperature (float, optional): temperature for scaling noise. Defaults to 1.0. + spks (torch.Tensor, optional): speaker ids. Defaults to None. + shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + + Returns: + sample: generated mel-spectrogram + shape: (batch_size, n_feats, mel_timesteps) + """ + z = ( + paddle.randn(shape=mu.shape, dtype=mu.dtype).to(mu.place).to(mu.dtype) + * temperature + ) + cache_size = cache.shape[2] + if cache_size != 0: + z[:, :, :cache_size] = cache[:, :, :, 0] + mu[:, :, :cache_size] = cache[:, :, :, 1] + z_cache = paddle.cat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2) + mu_cache = paddle.cat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2) + cache = paddle.stack([z_cache, mu_cache], dim=-1) + t_span = paddle.linspace(start=0, stop=1, num=n_timesteps + 1, dtype=mu.dtype) + if self.t_scheduler == "cosine": + t_span = 1 - paddle.cos(t_span * 0.5 * paddle.pi) + return ( + self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), + cache, + ) + + def solve_euler(self, x, t_span, mu, mask, spks, cond, streaming=False): + """ + Fixed euler solver for ODEs. + Args: + x (torch.Tensor): random noise + t_span (torch.Tensor): n_timesteps interpolated + shape: (n_timesteps + 1,) + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + spks (torch.Tensor, optional): speaker ids. Defaults to None. 
+ shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + """ + t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] + t = t.unsqueeze(dim=0) + sol = [] + x_in = paddle.zeros([2, 80, x.size(2)], device=x.place, dtype=x.dtype) + mask_in = paddle.zeros([2, 1, x.size(2)], device=x.place, dtype=x.dtype) + mu_in = paddle.zeros([2, 80, x.size(2)], device=x.place, dtype=x.dtype) + t_in = paddle.zeros([2], device=x.place, dtype=x.dtype) + spks_in = paddle.zeros([2, 80], device=x.place, dtype=x.dtype) + cond_in = paddle.zeros([2, 80, x.size(2)], device=x.place, dtype=x.dtype) + for step in range(1, len(t_span)): + x_in[:] = x + mask_in[:] = mask + mu_in[0] = mu + t_in[:] = t.unsqueeze(0) + spks_in[0] = spks + cond_in[0] = cond + dphi_dt = self.forward_estimator( + x_in, mask_in, mu_in, t_in, spks_in, cond_in, streaming + ) + dphi_dt, cfg_dphi_dt = paddle.compat.split( + dphi_dt, [x.size(0), x.size(0)], dim=0 + ) + dphi_dt = ( + 1.0 + self.inference_cfg_rate + ) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt + x = x + dt * dphi_dt + t = t + dt + sol.append(x) + if step < len(t_span) - 1: + dt = t_span[step + 1] - t + return sol[-1].float() + + def forward_estimator(self, x, mask, mu, t, spks, cond, streaming=False): + if isinstance(self.estimator, paddle.nn.Layer): + return self.estimator(x, mask, mu, t, spks, cond, streaming=streaming) + else: + [estimator, stream], trt_engine = self.estimator.acquire_estimator() + paddle.device.current_stream().synchronize() + with stream: + estimator.set_input_shape("x", (2, 80, x.size(2))) + estimator.set_input_shape("mask", (2, 1, x.size(2))) + estimator.set_input_shape("mu", (2, 80, x.size(2))) + estimator.set_input_shape("t", (2,)) + estimator.set_input_shape("spks", (2, 80)) + estimator.set_input_shape("cond", (2, 80, x.size(2))) + data_ptrs = [ + x.contiguous().data_ptr(), + mask.contiguous().data_ptr(), + mu.contiguous().data_ptr(), + t.contiguous().data_ptr(), + spks.contiguous().data_ptr(), + 
cond.contiguous().data_ptr(), + x.data_ptr(), + ] + for i, j in enumerate(data_ptrs): + estimator.set_tensor_address(trt_engine.get_tensor_name(i), j) + assert ( + estimator.execute_async_v3( + paddle.device.current_stream().cuda_stream + ) + is True + ) + paddle.device.current_stream().synchronize() + self.estimator.release_estimator(estimator, stream) + return x + + def compute_loss(self, x1, mask, mu, spks=None, cond=None, streaming=False): + """Computes diffusion loss + + Args: + x1 (torch.Tensor): Target + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): target mask + shape: (batch_size, 1, mel_timesteps) + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + spks (torch.Tensor, optional): speaker embedding. Defaults to None. + shape: (batch_size, spk_emb_dim) + + Returns: + loss: conditional flow matching loss + y: conditional flow + shape: (batch_size, n_feats, mel_timesteps) + """ + b, _, t = mu.shape + t = paddle.rand(shape=[b, 1, 1], dtype=mu.dtype) + if self.t_scheduler == "cosine": + t = 1 - paddle.cos(t * 0.5 * paddle.pi) + z = paddle.randn(shape=x1.shape, dtype=x1.dtype) + y = (1 - (1 - self.sigma_min) * t) * z + t * x1 + u = x1 - (1 - self.sigma_min) * z + if self.training_cfg_rate > 0: + cfg_mask = paddle.rand(shape=b) > self.training_cfg_rate + mu = mu * cfg_mask.view(-1, 1, 1) + spks = spks * cfg_mask.view(-1, 1) + cond = cond * cfg_mask.view(-1, 1, 1) + pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond, streaming=streaming) + loss = paddle.nn.functional.mse_loss( + input=pred * mask, label=u * mask, reduction="sum" + ) / (paddle.sum(mask) * u.shape[1]) + return loss, y + + +class CausalConditionalCFM(ConditionalCFM): + def __init__( + self, + in_channels, + cfm_params, + n_spks=1, + spk_emb_dim=64, + estimator: paddle.nn.Layer = None, + ): + super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator) + set_all_random_seed(0) + self.rand_noise = paddle.randn([1, 80, 50 * 
300]) + + @paddle.no_grad() + def forward( + self, + mu, + mask, + n_timesteps, + temperature=1.0, + spks=None, + cond=None, + streaming=False, + ): + """Forward diffusion + + Args: + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + n_timesteps (int): number of diffusion steps + temperature (float, optional): temperature for scaling noise. Defaults to 1.0. + spks (torch.Tensor, optional): speaker ids. Defaults to None. + shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + + Returns: + sample: generated mel-spectrogram + shape: (batch_size, n_feats, mel_timesteps) + """ + z = self.rand_noise[:, :, : mu.size(2)].to(mu.place).to(mu.dtype) * temperature + t_span = paddle.linspace(start=0, stop=1, num=n_timesteps + 1, dtype=mu.dtype) + if self.t_scheduler == "cosine": + t_span = 1 - paddle.cos(t_span * 0.5 * paddle.pi) + return ( + self.solve_euler( + z, + t_span=t_span, + mu=mu, + mask=mask, + spks=spks, + cond=cond, + streaming=streaming, + ), + None, + ) diff --git a/paddlespeech/t2s/modules/flow/length_regulator.py b/paddlespeech/t2s/modules/flow/length_regulator.py new file mode 100644 index 0000000000..db6a35818b --- /dev/null +++ b/paddlespeech/t2s/modules/flow/length_regulator.py @@ -0,0 +1,91 @@ +from typing import Tuple + +import paddle + +from cosyvoice.utils.mask import make_pad_mask + +############################## 相关utils函数,如下 ############################## + +def _Tensor_max(self, *args, **kwargs): + if "other" in kwargs: + kwargs["y"] = kwargs.pop("other") + ret = paddle.maximum(self, *args, **kwargs) + elif len(args) == 1 and isinstance(args[0], paddle.Tensor): + ret = paddle.maximum(self, *args, **kwargs) + else: + if "dim" in kwargs: + kwargs["axis"] = kwargs.pop("dim") + + if "axis" in kwargs or len(args) >= 1: + ret = paddle.max(self, *args, **kwargs), paddle.argmax(self, *args, **kwargs) + else: + ret = 
paddle.max(self, *args, **kwargs) + + return ret + +setattr(paddle.Tensor, "_max", _Tensor_max) +############################## 相关utils函数,如上 ############################## + + + +class InterpolateRegulator(paddle.nn.Layer): + def __init__( + self, + channels: int, + sampling_ratios: Tuple, + out_channels: int = None, + groups: int = 1, + ): + super().__init__() + self.sampling_ratios = sampling_ratios + out_channels = out_channels or channels + model = paddle.nn.LayerList(sublayers=[]) + if len(sampling_ratios) > 0: + for _ in sampling_ratios: + module = paddle.nn.Conv1d(channels, channels, 3, 1, 1) + norm = paddle.nn.GroupNorm(num_groups=groups, num_channels=channels) + act = paddle.nn.Mish() + model.extend([module, norm, act]) + model.append(paddle.nn.Conv1d(channels, out_channels, 1, 1)) + self.model = paddle.nn.Sequential(*model) + + def forward(self, x, ylens=None): + mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) + x = paddle.nn.functional.interpolate( + x=x.transpose(1, 2).contiguous(), size=ylens._max(), mode="linear" + ) + out = self.model(x).transpose(1, 2).contiguous() + olens = ylens + return out * mask, olens + + def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50): + if x2.shape[1] > 40: + x2_head = paddle.nn.functional.interpolate( + x=x2[:, :20].transpose(1, 2).contiguous(), + size=int(20 / input_frame_rate * 22050 / 256), + mode="linear", + ) + x2_mid = paddle.nn.functional.interpolate( + x=x2[:, 20:-20].transpose(1, 2).contiguous(), + size=mel_len2 - int(20 / input_frame_rate * 22050 / 256) * 2, + mode="linear", + ) + x2_tail = paddle.nn.functional.interpolate( + x=x2[:, -20:].transpose(1, 2).contiguous(), + size=int(20 / input_frame_rate * 22050 / 256), + mode="linear", + ) + x2 = paddle.cat([x2_head, x2_mid, x2_tail], dim=2) + else: + x2 = paddle.nn.functional.interpolate( + x=x2.transpose(1, 2).contiguous(), size=mel_len2, mode="linear" + ) + if x1.shape[1] != 0: + x1 = paddle.nn.functional.interpolate( + x=x1.transpose(1, 
"""Tokenizer utilities.

Provides a tiktoken BPE encoding extended with task-specific special tokens
(`get_encoding`) and a thin wrapper around the Qwen2 tokenizer
(`QwenTokenizer` / `get_qwen_tokenizer`).
"""
import base64
import os
from functools import lru_cache

from paddlenlp.transformers import AutoTokenizer
import paddle
import tiktoken

# Language code -> lower-case language name.
# Insertion order matters: `get_encoding` turns the first `num_languages`
# keys into special tokens, so reordering entries changes token ids.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
    "yue": "cantonese",
    "minnan": "minnan",
    "wuyu": "wuyu",
    "dialect": "dialect",
    "zh/en": "zh/en",
    "en/zh": "en/zh",
}

# Language name -> code: the inverse of LANGUAGES plus a few alias names.
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
    "mandarin": "zh",
}

# The tables below are identity maps; only their keys (and key order) are
# consumed by `get_encoding`, which wraps each key as "<|key|>".
AUDIO_EVENT = {
    "ASR": "ASR",
    "AED": "AED",
    "SER": "SER",
    "Speech": "Speech",
    "/Speech": "/Speech",
    "BGM": "BGM",
    "/BGM": "/BGM",
    "Laughter": "Laughter",
    "/Laughter": "/Laughter",
    "Applause": "Applause",
    "/Applause": "/Applause",
}
EMOTION = {"HAPPY": "HAPPY", "SAD": "SAD", "ANGRY": "ANGRY", "NEUTRAL": "NEUTRAL"}
TTS_Vocal_Token = {
    "TTS/B": "TTS/B",
    "TTS/O": "TTS/O",
    "TTS/Q": "TTS/Q",
    "TTS/A": "TTS/A",
    "TTS/CO": "TTS/CO",
    "TTS/CL": "TTS/CL",
    "TTS/H": "TTS/H",
    **{f"TTS/SP{i:02d}": f"TTS/SP{i:02d}" for i in range(1, 14)},
}


def _load_mergeable_ranks(vocab_path):
    """Read a ``<base64 token> <rank>`` vocabulary file into ``{bytes: int}``."""
    ranks = {}
    # The context manager closes the file deterministically; the previous
    # bare ``open()`` inside a generator left that to the garbage collector.
    with open(vocab_path, encoding="utf-8") as vocab_file:
        for line in vocab_file:
            # Iterated lines keep their newline, so a plain truthiness test
            # never filters anything; strip first to really skip blank lines.
            if not line.strip():
                continue
            token, rank = line.split()
            ranks[base64.b64decode(token)] = int(rank)
    return ranks


def _special_token_names(num_languages):
    """Return the special-token strings in the order their ids are assigned."""
    return [
        "<|endoftext|>",
        "<|startoftranscript|>",
        *[f"<|{lang}|>" for lang in list(LANGUAGES)[:num_languages]],
        *[f"<|{audio_event}|>" for audio_event in AUDIO_EVENT],
        *[f"<|{emotion}|>" for emotion in EMOTION],
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
        *[f"<|SPECIAL_TOKEN_{i}|>" for i in range(1, 31)],
        *[f"<|{tts}|>" for tts in TTS_Vocal_Token],
        # Timestamp tokens "<|0.00|>" .. "<|30.00|>" in 0.02 steps.
        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
    ]


@lru_cache(maxsize=None)
def get_encoding(name: str = "gpt2", num_languages: int = 99):
    """Build the tiktoken encoding stored as ``assets/<name>.tiktoken``.

    The vocabulary file is looked up in an ``assets`` directory next to this
    module. Special tokens are appended after the base vocabulary, one id per
    entry, in the order produced by ``_special_token_names``.

    Results are memoised per ``(name, num_languages)`` pair.

    Args:
        name: Base name of the ``.tiktoken`` vocabulary file.
        num_languages: How many leading entries of ``LANGUAGES`` receive a
            ``<|code|>`` special token.

    Returns:
        A ``tiktoken.Encoding`` with the base ranks and the special tokens.

    Raises:
        OSError: If the vocabulary file cannot be opened.
        ValueError: If a non-blank line is not exactly ``<token> <rank>``.
    """
    vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
    ranks = _load_mergeable_ranks(vocab_path)

    first_special_id = len(ranks)
    specials = _special_token_names(num_languages)
    special_tokens = {
        token: first_special_id + offset for offset, token in enumerate(specials)
    }
    # Every list entry consumes one id, matching the original counting loop.
    n_vocab = first_special_id + len(specials)

    return tiktoken.Encoding(
        name=os.path.basename(vocab_path),
        explicit_n_vocab=n_vocab,
        pat_str="'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
        mergeable_ranks=ranks,
        special_tokens=special_tokens,
    )


class QwenTokenizer:
    """Wrapper around the ``Qwen/Qwen2-0.5B`` tokenizer with extra special tokens."""

    def __init__(self, skip_special_tokens=True):
        """Load the pretrained tokenizer and register the extra special tokens.

        Args:
            skip_special_tokens: Forwarded to ``batch_decode`` by ``decode``.
        """
        super().__init__()
        special_tokens = {
            "eos_token": "<|endoftext|>",
            "pad_token": "<|endoftext|>",
            # NOTE(review): the four empty-string entries below look like
            # markup-style tokens whose text was lost somewhere upstream;
            # confirm against the reference vocabulary before relying on ids.
            "additional_special_tokens": [
                "<|im_start|>",
                "<|im_end|>",
                "<|endofprompt|>",
                "[breath]",
                "",
                "",
                "[noise]",
                "[laughter]",
                "[cough]",
                "[clucking]",
                "[accent]",
                "[quick_breath]",
                "",
                "",
                "[hissing]",
                "[sigh]",
                "[vocalized-noise]",
                "[lipsmack]",
                "[mn]",
            ],
        }
        self.special_tokens = special_tokens
        self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
        self.tokenizer.add_special_tokens(special_tokens)
        self.skip_special_tokens = skip_special_tokens

    def encode(self, text, **kwargs):
        """Tokenize ``text`` and return its input ids as a plain Python list.

        ``**kwargs`` is accepted for call-site compatibility but is not
        forwarded to the underlying tokenizer.
        """
        encoded = self.tokenizer([text], return_tensors="pd")
        return encoded["input_ids"][0].cpu().tolist()

    def decode(self, tokens):
        """Convert a sequence of token ids back into a string."""
        # `paddle.to_tensor` is the documented tensor constructor;
        # `paddle.tensor` names a package in long-standing Paddle releases and
        # may not be callable there.
        token_ids = paddle.to_tensor(tokens, dtype=paddle.int64)
        decoded = self.tokenizer.batch_decode(
            [token_ids], skip_special_tokens=self.skip_special_tokens
        )
        return decoded[0]


@lru_cache(maxsize=None)
def get_qwen_tokenizer(skip_special_tokens: bool) -> QwenTokenizer:
    """Return a cached ``QwenTokenizer`` for the given ``skip_special_tokens``."""
    return QwenTokenizer(skip_special_tokens=skip_special_tokens)
100644 index 0000000000..731d141199 Binary files /dev/null and b/q.pdparams differ diff --git a/q.pt b/q.pt new file mode 100644 index 0000000000..4d315c18cf Binary files /dev/null and b/q.pt differ