diff --git a/Linear_test.py b/Linear_test.py
new file mode 100644
index 0000000000..0ee1d20836
--- /dev/null
+++ b/Linear_test.py
@@ -0,0 +1,11 @@
+import paddle,torch,numpy
+torch_linear = torch.load("q.pt").cpu()
+paddle_linear_state = paddle.load("q.pdparams")
+paddle_linear = paddle.nn.Linear(896,896,bias_attr=True)
+hidden_states = paddle.load("hidden_states.pdparams")
+paddle_linear.set_state_dict(paddle_linear_state)
+torch_forward_res = torch_linear(torch.tensor(hidden_states.numpy()))
+paddle_forward_res = paddle_linear(hidden_states)
+print("torch_forward_res:",torch_forward_res)
+print("paddle_forward_res:",paddle_forward_res)
+print('allclose_res:',numpy.testing.assert_allclose(torch_forward_res.detach().numpy(),paddle_forward_res))
\ No newline at end of file
diff --git a/hidden_states.pdparams b/hidden_states.pdparams
new file mode 100644
index 0000000000..94c37f8b5a
Binary files /dev/null and b/hidden_states.pdparams differ
diff --git a/paddlespeech/cli/tts/cosyvoice.py b/paddlespeech/cli/tts/cosyvoice.py
new file mode 100644
index 0000000000..7ebe4dc806
--- /dev/null
+++ b/paddlespeech/cli/tts/cosyvoice.py
@@ -0,0 +1,34 @@
+from paddlespeech.t2s.models.CosyVoice.cosyvoice import CosyVoice2
+import sys
+from paddlenlp.transformers import AutoTokenizer, AutoModelForCausalLM
+from pathlib import Path
+import paddle
+import torch
+from paddlespeech.t2s.models.CosyVoice.llm import Qwen2LM,ras_sampling,Qwen2Encoder
+# cosyvoice_model = CosyVoice2("../CosyVoice/pretrained_models/CosyVoice2-0.5B_paddle")
+model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2-0.5B')
+llm = Qwen2Encoder(model)
+qwen_lm = Qwen2LM(896,896,6561,llm,ras_sampling)
+state_dict = paddle.load("/root/paddlejob/workspace/zhangjinghong/CosyVoice/pretrained_models/CosyVoice2-0.5B_paddle/llm.pdparams")
+qwen_lm.set_state_dict(state_dict)
+new_dict = torch.load("data.pt")
+text = new_dict['text'] 
+text_len = new_dict['text_len']
+prompt_text = new_dict['prompt_text']
+prompt_text_len = new_dict['prompt_text_len']
+prompt_speech_token = new_dict['prompt_speech_token']
+prompt_speech_token_len = new_dict['prompt_speech_token_len']
+embedding = new_dict['embedding']
+uuid = new_dict['uuid']
+print("text:",text)
+# for i in qwen_lm.inference(text=paddle.to_tensor(text),
+#     text_len=text_len,
+#     prompt_text=paddle.to_tensor(prompt_text),
+#     prompt_text_len=prompt_text_len,
+#     prompt_speech_token=paddle.to_tensor(prompt_speech_token),
+#     prompt_speech_token_len=prompt_speech_token_len,
+#     embedding=paddle.to_tensor(embedding,dtype = 'float32'),
+#     uuid=uuid):
+#     print(text)
+#     print(i)
+
diff --git a/paddlespeech/t2s/models/CosyVoice/__init__.py b/paddlespeech/t2s/models/CosyVoice/__init__.py
new file mode 100644
index 0000000000..e0b064387d
--- /dev/null
+++ b/paddlespeech/t2s/models/CosyVoice/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .cosyvoice import *
diff --git a/paddlespeech/t2s/models/CosyVoice/cosyvoice.py b/paddlespeech/t2s/models/CosyVoice/cosyvoice.py
new file mode 100644
index 0000000000..7f352f0e26
--- /dev/null
+++ b/paddlespeech/t2s/models/CosyVoice/cosyvoice.py
@@ -0,0 +1,360 @@
+import os
+import time
+from typing import Generator
+
+import paddle
+from hyperpyyaml import load_hyperpyyaml
+from modelscope import snapshot_download
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+logging.basicConfig(level=logging.DEBUG,
+                    format='%(asctime)s %(levelname)s %(message)s')
+from paddlespeech.t2s.models.CosyVoice.frontend import CosyVoiceFrontEnd
+from paddlespeech.t2s.models.CosyVoice.model import CosyVoice2Model
+
+def get_model_type(configs):
+    # NOTE CosyVoice2Model inherits CosyVoiceModel. Maps the llm/flow/hift objects stored in `configs` to a model class.
+    if isinstance(configs['llm'], TransformerLM) and isinstance(configs['flow'], MaskedDiffWithXvec) and isinstance(configs['hift'], HiFTGenerator):  # NOTE(review): TransformerLM, MaskedDiffWithXvec and HiFTGenerator are not imported in this module
+        return CosyVoiceModel  # NOTE(review): CosyVoiceModel is not imported in this module either
+    if isinstance(configs['llm'], Qwen2LM) and isinstance(configs['flow'], CausalMaskedDiffWithXvec) and isinstance(configs['hift'], HiFTGenerator):  # NOTE(review): Qwen2LM and CausalMaskedDiffWithXvec are not imported either
+        return CosyVoice2Model
+    raise TypeError('No valid model type found!')
+class CosyVoice:
+    def __init__(
+        self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_concurrent=1
+    ):
+        self.instruct = True if "-Instruct" in model_dir else False
+        self.model_dir = model_dir
+        self.fp16 = fp16
+        if not os.path.exists(model_dir):
+            model_dir = snapshot_download(model_dir)
+        hyper_yaml_path = "{}/cosyvoice.yaml".format(model_dir)
+        if not os.path.exists(hyper_yaml_path):
+            raise ValueError("{} not found!".format(hyper_yaml_path))
+        with open(hyper_yaml_path, "r") as f:
+            configs = load_hyperpyyaml(f)
+        # assert (
+        #     get_model_type(configs) != CosyVoice2Model
+        # ), "do not use {} for CosyVoice initialization!".format(model_dir)
+        self.frontend = CosyVoiceFrontEnd(
+            configs["get_tokenizer"],
+            configs["feat_extractor"],
+            "{}/campplus.onnx".format(model_dir),
+            "{}/speech_tokenizer_v1.onnx".format(model_dir),
+            "{}/spk2info.pt".format(model_dir),
+            configs["allowed_special"],
+        )
+        self.sample_rate = configs["sample_rate"]
+        if (paddle.device.cuda.device_count() >= 1) is False and (
+            load_jit is True or load_trt is True or fp16 is True
+        ):
+            load_jit, load_trt, fp16 = False, False, False
+            logging.warning("no cuda device, set load_jit/load_trt/fp16 to False")
+        self.model = CosyVoiceModel(
+            configs["llm"], configs["flow"], configs["hift"], fp16
+        )
+        self.model.load(
+            "{}/llm.pt".format(model_dir),
+            "{}/flow.pt".format(model_dir),
+            "{}/hift.pt".format(model_dir),
+        )
+        if load_jit:
+            self.model.load_jit(
+                "{}/llm.text_encoder.{}.zip".format(
+                    model_dir, "fp16" if self.fp16 is True else "fp32"
+                ),
+                "{}/llm.llm.{}.zip".format(
+                    model_dir, "fp16" if self.fp16 is True else "fp32"
+                ),
+                "{}/flow.encoder.{}.zip".format(
+                    model_dir, "fp16" if self.fp16 is True else "fp32"
+                ),
+            )
+        if load_trt:
+            self.model.load_trt(
+                "{}/flow.decoder.estimator.{}.mygpu.plan".format(
+                    model_dir, "fp16" if self.fp16 is True else "fp32"
+                ),
+                "{}/flow.decoder.estimator.fp32.onnx".format(model_dir),
+                trt_concurrent,
+                self.fp16,
+            )
+        del configs
+
+    def list_available_spks(self):
+        spks = list(self.frontend.spk2info.keys())
+        return spks
+
+    def add_zero_shot_spk(self, prompt_text, prompt_speech_16k, zero_shot_spk_id):
+        assert zero_shot_spk_id != "", "do not use empty zero_shot_spk_id"
+        model_input = self.frontend.frontend_zero_shot(
+            "", prompt_text, prompt_speech_16k, self.sample_rate, ""
+        )
+        del model_input["text"]
+        del model_input["text_len"]
+        self.frontend.spk2info[zero_shot_spk_id] = model_input
+        return True
+
+    def save_spkinfo(self):
+        paddle.save(
+            obj=self.frontend.spk2info, path="{}/spk2info.pt".format(self.model_dir)
+        )
+
+    def inference_sft(
+        self, tts_text, spk_id, stream=False, speed=1.0, text_frontend=True
+    ):
+        for i in tqdm(
+            self.frontend.text_normalize(
+                tts_text, split=True, text_frontend=text_frontend
+            )
+        ):
+            model_input = self.frontend.frontend_sft(i, spk_id)
+            start_time = time.time()
+            logging.info("synthesis text {}".format(i))
+            for model_output in self.model.tts(
+                **model_input, stream=stream, speed=speed
+            ):
+                speech_len = model_output["tts_speech"].shape[1] / self.sample_rate
+                logging.info(
+                    "yield speech len {}, rtf {}".format(
+                        speech_len, (time.time() - start_time) / speech_len
+                    )
+                )
+                yield model_output
+                start_time = time.time()
+
+    def inference_zero_shot(
+        self,
+        tts_text,
+        prompt_text,
+        prompt_speech_16k,
+        zero_shot_spk_id="",
+        stream=False,
+        speed=1.0,
+        text_frontend=True,
+    ):
+        prompt_text = self.frontend.text_normalize(
+            prompt_text, split=False, text_frontend=text_frontend
+        )
+        for i in tqdm(
+            self.frontend.text_normalize(
+                tts_text, split=True, text_frontend=text_frontend
+            )
+        ):
+            if not isinstance(i, Generator) and len(i) < 0.5 * len(prompt_text):
+                logging.warning(
+                    "synthesis text {} too short than prompt text {}, this may lead to bad performance".format(
+                        i, prompt_text
+                    )
+                )
+            model_input = self.frontend.frontend_zero_shot(
+                i, prompt_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id
+            )
+            start_time = time.time()
+            logging.info("synthesis text {}".format(i))
+            for model_output in self.model.tts(
+                **model_input, stream=stream, speed=speed
+            ):
+                speech_len = model_output["tts_speech"].shape[1] / self.sample_rate
+                logging.info(
+                    "yield speech len {}, rtf {}".format(
+                        speech_len, (time.time() - start_time) / speech_len
+                    )
+                )
+                yield model_output
+                start_time = time.time()
+
+    def inference_cross_lingual(
+        self,
+        tts_text,
+        prompt_speech_16k,
+        zero_shot_spk_id="",
+        stream=False,
+        speed=1.0,
+        text_frontend=True,
+    ):
+        for i in tqdm(
+            self.frontend.text_normalize(
+                tts_text, split=True, text_frontend=text_frontend
+            )
+        ):
+            model_input = self.frontend.frontend_cross_lingual(
+                i, prompt_speech_16k, self.sample_rate, zero_shot_spk_id
+            )
+            start_time = time.time()
+            logging.info("synthesis text {}".format(i))
+            for model_output in self.model.tts(
+                **model_input, stream=stream, speed=speed
+            ):
+                speech_len = model_output["tts_speech"].shape[1] / self.sample_rate
+                logging.info(
+                    "yield speech len {}, rtf {}".format(
+                        speech_len, (time.time() - start_time) / speech_len
+                    )
+                )
+                yield model_output
+                start_time = time.time()
+
+    def inference_instruct(
+        self,
+        tts_text,
+        spk_id,
+        instruct_text,
+        stream=False,
+        speed=1.0,
+        text_frontend=True,
+    ):
+        assert isinstance(
+            self.model, CosyVoiceModel
+        ), "inference_instruct is only implemented for CosyVoice!"
+        if self.instruct is False:
+            raise ValueError(
+                "{} do not support instruct inference".format(self.model_dir)
+            )
+        instruct_text = self.frontend.text_normalize(
+            instruct_text, split=False, text_frontend=text_frontend
+        )
+        for i in tqdm(
+            self.frontend.text_normalize(
+                tts_text, split=True, text_frontend=text_frontend
+            )
+        ):
+            model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
+            start_time = time.time()
+            logging.info("synthesis text {}".format(i))
+            for model_output in self.model.tts(
+                **model_input, stream=stream, speed=speed
+            ):
+                speech_len = model_output["tts_speech"].shape[1] / self.sample_rate
+                logging.info(
+                    "yield speech len {}, rtf {}".format(
+                        speech_len, (time.time() - start_time) / speech_len
+                    )
+                )
+                yield model_output
+                start_time = time.time()
+
+    def inference_vc(
+        self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0
+    ):
+        model_input = self.frontend.frontend_vc(
+            source_speech_16k, prompt_speech_16k, self.sample_rate
+        )
+        start_time = time.time()
+        for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
+            speech_len = model_output["tts_speech"].shape[1] / self.sample_rate
+            logging.info(
+                "yield speech len {}, rtf {}".format(
+                    speech_len, (time.time() - start_time) / speech_len
+                )
+            )
+            yield model_output
+            start_time = time.time()
+
+
+class CosyVoice2(CosyVoice):
+    def __init__(
+        self,
+        model_dir,
+        load_jit=False,
+        load_trt=False,
+        load_vllm=False,
+        fp16=False,
+        trt_concurrent=1,
+    ):
+        self.instruct = True if "-Instruct" in model_dir else False
+        self.model_dir = model_dir
+        self.fp16 = fp16
+        hyper_yaml_path = "{}/cosyvoice2.yaml".format(model_dir)
+        if not os.path.exists(hyper_yaml_path):
+            raise ValueError("{} not found!".format(hyper_yaml_path))
+        with open(hyper_yaml_path, "r") as f:
+            configs = load_hyperpyyaml(
+                f,
+                overrides={
+                    "qwen_pretrain_path": os.path.join(model_dir, "CosyVoice-BlankEN")
+                },
+            )
+        # assert (
+        #     get_model_type(configs) == CosyVoice2Model
+        # ), "do not use {} for CosyVoice2 initialization!".format(model_dir)
+        self.frontend = CosyVoiceFrontEnd(
+            configs["get_tokenizer"],
+            configs["feat_extractor"],
+            "{}/campplus.onnx".format(model_dir),
+            "{}/speech_tokenizer_v2.onnx".format(model_dir),
+            "{}/spk2info.pt".format(model_dir),
+            configs["allowed_special"],
+        )
+        self.sample_rate = configs["sample_rate"]
+        if (paddle.device.cuda.device_count() >= 1) is False and (
+            load_jit is True or load_trt is True or fp16 is True
+        ):
+            load_jit, load_trt, fp16 = False, False, False
+            logging.warning("no cuda device, set load_jit/load_trt/fp16 to False")
+        self.model = CosyVoice2Model(
+            configs["llm"], configs["flow"], configs["hift"], fp16
+        )
+        self.model.load(
+            "{}/llm.pt".format(model_dir),
+            "{}/flow.pt".format(model_dir),
+            "{}/hift.pt".format(model_dir),
+        )
+        if load_vllm:
+            self.model.load_vllm("{}/vllm".format(model_dir))
+        if load_jit:
+            self.model.load_jit(
+                "{}/flow.encoder.{}.zip".format(
+                    model_dir, "fp16" if self.fp16 is True else "fp32"
+                )
+            )
+        if load_trt:
+            self.model.load_trt(
+                "{}/flow.decoder.estimator.{}.mygpu.plan".format(
+                    model_dir, "fp16" if self.fp16 is True else "fp32"
+                ),
+                "{}/flow.decoder.estimator.fp32.onnx".format(model_dir),
+                trt_concurrent,
+                self.fp16,
+            )
+        del configs
+
+    def inference_instruct(self, *args, **kwargs):
+        raise NotImplementedError(
+            "inference_instruct is not implemented for CosyVoice2!"
+        )
+
+    def inference_instruct2(
+        self,
+        tts_text,
+        instruct_text,
+        prompt_speech_16k,
+        zero_shot_spk_id="",
+        stream=False,
+        speed=1.0,
+        text_frontend=True,
+    ):
+        assert isinstance(
+            self.model, CosyVoice2Model
+        ), "inference_instruct2 is only implemented for CosyVoice2!"
+        for i in tqdm(
+            self.frontend.text_normalize(
+                tts_text, split=True, text_frontend=text_frontend
+            )
+        ):
+            model_input = self.frontend.frontend_instruct2(
+                i, instruct_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id
+            )
+            start_time = time.time()
+            logging.info("synthesis text {}".format(i))
+            for model_output in self.model.tts(
+                **model_input, stream=stream, speed=speed
+            ):
+                speech_len = model_output["tts_speech"].shape[1] / self.sample_rate
+                logging.info(
+                    "yield speech len {}, rtf {}".format(
+                        speech_len, (time.time() - start_time) / speech_len
+                    )
+                )
+                yield model_output
+                start_time = time.time()
diff --git a/paddlespeech/t2s/models/CosyVoice/flow.py b/paddlespeech/t2s/models/CosyVoice/flow.py
new file mode 100644
index 0000000000..f594cb18b3
--- /dev/null
+++ b/paddlespeech/t2s/models/CosyVoice/flow.py
@@ -0,0 +1,253 @@
+import math
+from typing import Any
+from typing import Dict
+from typing import List
+
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+
+class Decoder(nn.Layer):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        channels=(256, 256),
+        dropout=0.05,
+        attention_head_dim=64,
+        n_blocks=1,
+        num_mid_blocks=2,
+        num_heads=4,
+        act_fn="snake",
+        down_block_type="transformer",
+        mid_block_type="transformer",
+        up_block_type="transformer",
+    ):
+        super().__init__()
+        channels = tuple(channels)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.time_embeddings = SinusoidalPosEmb(in_channels)
+        time_embed_dim = channels[0] * 4
+        self.time_mlp = TimestepEmbedding(
+            in_channels=in_channels,
+            time_embed_dim=time_embed_dim,
+            act_fn="silu",
+        )
+
+        self.down_blocks = nn.ModuleList([])
+        self.mid_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        output_channel = in_channels
+        for i in range(len(channels)):  # pylint: disable=consider-using-enumerate
+            input_channel = output_channel
+            output_channel = channels[i]
+            is_last = i == len(channels) - 1
+            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+            transformer_blocks = nn.ModuleList(
+                [
+                    self.get_block(
+                        down_block_type,
+                        output_channel,
+                        attention_head_dim,
+                        num_heads,
+                        dropout,
+                        act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            downsample = (
+                Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+            )
+
+            self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
+
+        for i in range(num_mid_blocks):
+            input_channel = channels[-1]
+            out_channels = channels[-1]
+
+            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+
+            transformer_blocks = nn.ModuleList(
+                [
+                    self.get_block(
+                        mid_block_type,
+                        output_channel,
+                        attention_head_dim,
+                        num_heads,
+                        dropout,
+                        act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+
+            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
+
+        channels = channels[::-1] + (channels[0],)
+        for i in range(len(channels) - 1):
+            input_channel = channels[i]
+            output_channel = channels[i + 1]
+            is_last = i == len(channels) - 2
+
+            resnet = ResnetBlock1D(
+                dim=2 * input_channel,
+                dim_out=output_channel,
+                time_emb_dim=time_embed_dim,
+            )
+            transformer_blocks = nn.ModuleList(
+                [
+                    self.get_block(
+                        up_block_type,
+                        output_channel,
+                        attention_head_dim,
+                        num_heads,
+                        dropout,
+                        act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            upsample = (
+                Upsample1D(output_channel, use_conv_transpose=True)
+                if not is_last
+                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+            )
+
+            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
+
+        self.final_block = Block1D(channels[-1], channels[-1])
+        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
+
+        self.initialize_weights()
+        # nn.init.normal_(self.final_proj.weight)
+
+    @staticmethod
+    def get_block(block_type, dim, attention_head_dim, num_heads, dropout, act_fn):
+        if block_type == "conformer":
+            block = ConformerWrapper(
+                dim=dim,
+                dim_head=attention_head_dim,
+                heads=num_heads,
+                ff_mult=1,
+                conv_expansion_factor=2,
+                ff_dropout=dropout,
+                attn_dropout=dropout,
+                conv_dropout=dropout,
+                conv_kernel_size=31,
+            )
+        elif block_type == "transformer":
+            block = BasicTransformerBlock(
+                dim=dim,
+                num_attention_heads=num_heads,
+                attention_head_dim=attention_head_dim,
+                dropout=dropout,
+                activation_fn=act_fn,
+            )
+        else:
+            raise ValueError(f"Unknown block type {block_type}")
+
+        return block
+
+    def initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv1d):
+                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+            elif isinstance(m, nn.GroupNorm):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+            elif isinstance(m, nn.Linear):
+                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+    def forward(self, x, mask, mu, t, spks=None, cond=None):
+        """Forward pass of the UNet1DConditional model.
+
+        Args:
+            x (torch.Tensor): shape (batch_size, in_channels, time)
+            mask (_type_): shape (batch_size, 1, time)
+            t (_type_): shape (batch_size)
+            spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
+            cond (_type_, optional): placeholder for future use. Defaults to None.
+
+        Raises:
+            ValueError: _description_
+            ValueError: _description_
+
+        Returns:
+            _type_: _description_
+        """
+
+        t = self.time_embeddings(t)
+        t = self.time_mlp(t)
+
+        x = pack([x, mu], "b * t")[0]
+
+        if spks is not None:
+            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
+            x = pack([x, spks], "b * t")[0]
+
+        hiddens = []
+        masks = [mask]
+        for resnet, transformer_blocks, downsample in self.down_blocks:
+            mask_down = masks[-1]
+            x = resnet(x, mask_down, t)
+            x = rearrange(x, "b c t -> b t c")
+            mask_down = rearrange(mask_down, "b 1 t -> b t")
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=mask_down,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t")
+            mask_down = rearrange(mask_down, "b t -> b 1 t")
+            hiddens.append(x)  # Save hidden states for skip connections
+            x = downsample(x * mask_down)
+            masks.append(mask_down[:, :, ::2])
+
+        masks = masks[:-1]
+        mask_mid = masks[-1]
+
+        for resnet, transformer_blocks in self.mid_blocks:
+            x = resnet(x, mask_mid, t)
+            x = rearrange(x, "b c t -> b t c")
+            mask_mid = rearrange(mask_mid, "b 1 t -> b t")
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=mask_mid,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t")
+            mask_mid = rearrange(mask_mid, "b t -> b 1 t")
+
+        for resnet, transformer_blocks, upsample in self.up_blocks:
+            mask_up = masks.pop()
+            x = resnet(pack([x, hiddens.pop()], "b * t")[0], mask_up, t)
+            x = rearrange(x, "b c t -> b t c")
+            mask_up = rearrange(mask_up, "b 1 t -> b t")
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=mask_up,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t")
+            mask_up = rearrange(mask_up, "b t -> b 1 t")
+            x = upsample(x * mask_up)
+
+        x = self.final_block(x, mask_up)
+        output = self.final_proj(x * mask_up)
+
+        return output * mask
diff --git a/paddlespeech/t2s/models/CosyVoice/frontend.py b/paddlespeech/t2s/models/CosyVoice/frontend.py
new file mode 100644
index 0000000000..4fcbf6f172
--- /dev/null
+++ b/paddlespeech/t2s/models/CosyVoice/frontend.py
@@ -0,0 +1,448 @@
+import json
+import os
+import re
+from functools import partial
+from typing import Callable, Generator
+
+import inflect
+import numpy as np
+import onnxruntime
+import paddle
+import paddlespeech
+import whisper
+import logging
+try:
+    import ttsfrd
+
+    use_ttsfrd = True
+except ImportError:
+    print("failed to import ttsfrd, use wetext instead")
+    from wetext import Normalizer as EnNormalizer
+    from wetext import Normalizer as ZhNormalizer
+
+    use_ttsfrd = False
+# Paragraph-splitting logic:
+# 1. each chunk is at most token_max_n and at least token_min_n long; a trailing chunk shorter than merge_len is merged into the previous one
+# 2. chunk length is computed according to lang
+# 3. sentences are split at punctuation marks
+def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False):
+    """Split ``text`` at punctuation and regroup the sentences into chunks.
+
+    Args:
+        text: text to split; a terminal mark is appended if it does not end
+            with one of the splitting punctuation marks.
+        tokenize: callable used to measure length when ``lang`` is not "zh"
+            (the length is ``len(tokenize(piece))``).
+        lang: "zh" measures length in characters, anything else in tokens.
+        token_max_n: a chunk is closed once adding the next sentence would
+            exceed this length ...
+        token_min_n: ... provided the chunk is already longer than this.
+        merge_len: a final chunk shorter than this is appended to the
+            previous chunk when there is one.
+        comma_split: also split at full-width and ASCII commas.
+
+    Returns:
+        A list of chunks; an empty list for empty ``text``.
+    """
+    if not text:
+        # Nothing to split; indexing text[-1] below would raise IndexError.
+        return []
+
+    def calc_utt_length(_text: str):
+        if lang == "zh":
+            return len(_text)
+        return len(tokenize(_text))
+
+    def should_merge(_text: str):
+        return calc_utt_length(_text) < merge_len
+
+    if lang == "zh":
+        pounc = ['。', '？', '！', '；', '：', '、', '.', '?', '!', ';']
+    else:
+        pounc = ['.', '?', '!', ';', ':']
+    if comma_split:
+        pounc.extend(['，', ','])
+
+    if text[-1] not in pounc:
+        text += "。" if lang == "zh" else "."
+
+    st = 0
+    utts = []
+    for i, c in enumerate(text):
+        if c not in pounc:
+            continue
+        if len(text[st: i]) > 0:
+            utts.append(text[st: i] + c)
+        if i + 1 < len(text) and text[i + 1] in ['"', '”'] and utts:
+            # Keep a closing quote attached to the sentence it terminates.
+            utts[-1] = utts[-1] + text[i + 1]
+            st = i + 2
+        else:
+            # Also taken when a quote follows punctuation but no sentence has
+            # been collected yet: the quote then starts the next sentence
+            # instead of popping from an empty list.
+            st = i + 1
+
+    final_utts = []
+    cur_utt = ""
+    for utt in utts:
+        if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n:
+            final_utts.append(cur_utt)
+            cur_utt = ""
+        cur_utt = cur_utt + utt
+    if len(cur_utt) > 0:
+        if should_merge(cur_utt) and len(final_utts) != 0:
+            final_utts[-1] = final_utts[-1] + cur_utt
+        else:
+            final_utts.append(cur_utt)
+
+    return final_utts
+
+# spell Arabic numerals
+def spell_out_number(text: str, inflect_parser):
+    """Replace every run of digit characters with its spelled-out form.
+
+    Each maximal run of characters for which ``str.isdigit`` is true is passed
+    to ``inflect_parser.number_to_words``; all other characters are copied
+    through unchanged.
+    """
+    pieces = []
+    pos, end = 0, len(text)
+    while pos < end:
+        if not text[pos].isdigit():
+            pieces.append(text[pos])
+            pos += 1
+            continue
+        # Extend the run to the last consecutive digit.
+        run_end = pos
+        while run_end < end and text[run_end].isdigit():
+            run_end += 1
+        pieces.append(inflect_parser.number_to_words(text[pos:run_end]))
+        pos = run_end
+    return ''.join(pieces)
+
+# replace special symbol
+def replace_corner_mark(text):
+    """Replace the superscript marks '²' and '³' with '平方' and '立方'."""
+    for mark, reading in (('²', '平方'), ('³', '立方')):
+        text = text.replace(mark, reading)
+    return text
+
+# remove blank between chinese character
+def replace_blank(text: str):
+    """Remove spaces except those between two ASCII, non-space characters.
+
+    A space is kept only when both its neighbours exist, are ASCII and are
+    not spaces themselves. Leading and trailing spaces are therefore always
+    dropped.
+    """
+    out_str = []
+    last = len(text) - 1
+    for i, c in enumerate(text):
+        if c != " ":
+            out_str.append(c)
+            continue
+        # A space at either end has only one neighbour. Indexing text[i + 1]
+        # there raised IndexError, and text[i - 1] at i == 0 silently wrapped
+        # around to the last character.
+        if not 0 < i < last:
+            continue
+        prev_c, next_c = text[i - 1], text[i + 1]
+        if (next_c.isascii() and next_c != " ") and (prev_c.isascii() and prev_c != " "):
+            out_str.append(c)
+    return "".join(out_str)
+def is_only_punctuation(text):
+    """Return True if ``text`` is empty or holds only punctuation/symbol characters.
+
+    A character qualifies when its Unicode general category starts with "P"
+    (punctuation) or "S" (symbol), which is what the ``\\p{P}`` / ``\\p{S}``
+    classes of the previous pattern expressed.
+    """
+    # Local stdlib import: the previous implementation called the third-party
+    # ``regex`` module, which this file never imports (NameError at call time).
+    import unicodedata
+
+    return all(unicodedata.category(ch)[0] in ("P", "S") for ch in text)
+
+# remove meaningless symbol
+def remove_bracket(text):
+    """Strip full-width brackets and backticks, and turn '——' into a space."""
+    for symbol in ('（', '）', '【', '】', '`'):
+        text = text.replace(symbol, '')
+    return text.replace("——", " ")
+class CosyVoiceFrontEnd:
+    """Turn raw text and prompt audio into the input dictionaries used for synthesis.
+
+    The instance owns a text tokenizer, a feature extractor and two ONNX
+    Runtime sessions (``campplus_session`` for speaker embeddings and
+    ``speech_tokenizer_session`` for speech tokens). The ``frontend_*``
+    methods assemble per-mode ``model_input`` dictionaries from them.
+    """
+
+    # Matches a run of characters in the range U+4E00..U+9FFF.
+    _CHINESE_CHAR_PATTERN = re.compile(r"[\u4e00-\u9fff]+")
+
+    def __init__(
+        self,
+        get_tokenizer: Callable,
+        feat_extractor: Callable,
+        campplus_model: str,
+        speech_tokenizer_model: str,
+        spk2info: str = "",
+        allowed_special: str = "all",
+    ):
+        """Build the tokenizer, the ONNX sessions and the text normalizers.
+
+        Args:
+            get_tokenizer: zero-argument callable returning the text tokenizer.
+            feat_extractor: callable applied to the resampled prompt speech.
+            campplus_model: path of the ONNX model used for speaker embeddings.
+            speech_tokenizer_model: path of the ONNX speech-tokenizer model.
+            spk2info: path of a saved speaker-info mapping; when the path does
+                not exist an empty mapping is used.
+            allowed_special: forwarded to ``tokenizer.encode``.
+        """
+        self.tokenizer = get_tokenizer()
+        self.feat_extractor = feat_extractor
+        self.device = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = (
+            onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        )
+        option.intra_op_num_threads = 1
+        self.campplus_session = onnxruntime.InferenceSession(
+            campplus_model, sess_options=option, providers=["CPUExecutionProvider"]
+        )
+        self.speech_tokenizer_session = onnxruntime.InferenceSession(
+            speech_tokenizer_model,
+            sess_options=option,
+            providers=[
+                "CUDAExecutionProvider"
+                if paddle.device.cuda.device_count() >= 1
+                else "CPUExecutionProvider"
+            ],
+        )
+        if os.path.exists(spk2info):
+            self.spk2info = paddle.load(path=str(spk2info))
+        else:
+            self.spk2info = {}
+        self.allowed_special = allowed_special
+        self.use_ttsfrd = use_ttsfrd
+        if self.use_ttsfrd:
+            self.frd = ttsfrd.TtsFrontendEngine()
+            ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+            # Call initialize() outside the assert so it still runs when
+            # assertions are stripped (python -O).
+            initialized = self.frd.initialize(
+                "{}/../../pretrained_models/CosyVoice-ttsfrd/resource".format(
+                    ROOT_DIR
+                )
+            )
+            assert initialized is True, "failed to initialize ttsfrd resource"
+            self.frd.set_lang_type("pinyinvg")
+        else:
+            self.zh_tn_model = ZhNormalizer(remove_erhua=False)
+            self.en_tn_model = EnNormalizer()
+            self.inflect_parser = inflect.engine()
+
+    @staticmethod
+    def _contains_chinese(text):
+        """Return True if ``text`` has a character in U+4E00..U+9FFF."""
+        return bool(CosyVoiceFrontEnd._CHINESE_CHAR_PATTERN.search(text))
+
+    def _extract_text_token(self, text):
+        """Tokenize ``text`` into a ``(tokens, length)`` pair of int32 tensors.
+
+        When ``text`` is a generator, the first element is instead a generator
+        of per-token tensors and the length tensor holds 0.
+        """
+        if isinstance(text, Generator):
+            logging.info(
+                "get tts_text generator, will return _extract_text_token_generator!"
+            )
+            return self._extract_text_token_generator(text), paddle.to_tensor(
+                [0], dtype=paddle.int32
+            ).to(self.device)
+        else:
+            text_token = self.tokenizer.encode(
+                text, allowed_special=self.allowed_special
+            )
+            text_token = paddle.to_tensor([text_token], dtype=paddle.int32).to(
+                self.device
+            )
+            text_token_len = paddle.to_tensor(
+                [text_token.shape[1]], dtype=paddle.int32
+            ).to(self.device)
+            return text_token, text_token_len
+
+    def _extract_text_token_generator(self, text_generator):
+        """Yield the tokens of each generated text one column at a time."""
+        for text in text_generator:
+            text_token, _ = self._extract_text_token(text)
+            for i in range(text_token.shape[1]):
+                yield text_token[:, i : i + 1]
+
+    def _extract_speech_token(self, speech):
+        """Run the speech-tokenizer session on ``speech``.
+
+        Returns a ``(tokens, length)`` pair of int32 tensors. Asserts that
+        ``speech.shape[1] / 16000`` does not exceed 30.
+        """
+        assert (
+            speech.shape[1] / 16000 <= 30
+        ), "do not support extract speech token for audio longer than 30s"
+        # NOTE(review): whisper.log_mel_spectrogram presumably expects a torch
+        # tensor or numpy array — confirm what callers pass as `speech`.
+        feat = whisper.log_mel_spectrogram(speech, n_mels=128)
+        speech_token = (
+            self.speech_tokenizer_session.run(
+                None,
+                {
+                    self.speech_tokenizer_session.get_inputs()[0]
+                    .name: feat.detach()
+                    .cpu()
+                    .numpy(),
+                    self.speech_tokenizer_session.get_inputs()[1].name: np.array(
+                        [feat.shape[2]], dtype=np.int32
+                    ),
+                },
+            )[0]
+            .flatten()
+            .tolist()
+        )
+        speech_token = paddle.to_tensor([speech_token], dtype=paddle.int32).to(
+            self.device
+        )
+        speech_token_len = paddle.to_tensor(
+            [speech_token.shape[1]], dtype=paddle.int32
+        ).to(self.device)
+        return speech_token, speech_token_len
+
+    def _extract_spk_embedding(self, speech):
+        """Compute a speaker embedding for ``speech`` with the campplus session."""
+        # NOTE(review): `torchaudio` is not imported anywhere in this file, so
+        # this call raises NameError as written. It looks like a torch
+        # dependency that still has to be ported — confirm the replacement.
+        feat = torchaudio.compliance.kaldi.fbank(
+            speech, num_mel_bins=80, dither=0, sample_frequency=16000
+        )
+        # Subtract the mean over the first axis before running the model.
+        feat = feat - feat.mean(dim=0, keepdim=True)
+        embedding = (
+            self.campplus_session.run(
+                None,
+                {
+                    self.campplus_session.get_inputs()[0]
+                    .name: feat.unsqueeze(dim=0)
+                    .cpu()
+                    .numpy()
+                },
+            )[0]
+            .flatten()
+            .tolist()
+        )
+        embedding = paddle.to_tensor([embedding]).to(self.device)
+        return embedding
+
+    def _extract_speech_feat(self, speech):
+        """Apply ``feat_extractor`` to ``speech``.
+
+        Returns ``(features, length)`` where the length tensor holds
+        ``features.shape[1]``.
+        """
+        speech_feat = (
+            self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
+        )
+        speech_feat = speech_feat.unsqueeze(dim=0)
+        speech_feat_len = paddle.to_tensor(
+            [speech_feat.shape[1]], dtype=paddle.int32
+        ).to(self.device)
+        return speech_feat, speech_feat_len
+
+    def text_normalize(self, text, split=True, text_frontend=True):
+        """Normalize ``text`` and optionally split it into chunks.
+
+        Args:
+            text: input string, or a generator (returned wrapped in a list,
+                without normalization).
+            split: when True return the list of chunks, otherwise the
+                normalized string.
+            text_frontend: when False (or when ``text`` is empty) the text is
+                returned unnormalized.
+        """
+        if isinstance(text, Generator):
+            logging.info("get tts_text generator, will skip text_normalize!")
+            return [text]
+        if text_frontend is False or text == "":
+            return [text] if split is True else text
+        text = text.strip()
+        if self.use_ttsfrd:
+            texts = [
+                i["text"]
+                for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]
+            ]
+            text = "".join(texts)
+        elif self._contains_chinese(text):
+            text = self.zh_tn_model.normalize(text)
+            text = text.replace("\n", "")
+            text = replace_blank(text)
+            text = replace_corner_mark(text)
+            text = text.replace(".", "。")
+            text = text.replace(" - ", "，")
+            text = remove_bracket(text)
+            text = re.sub("[，,、]+$", "。", text)
+            texts = list(
+                split_paragraph(
+                    text,
+                    partial(
+                        self.tokenizer.encode, allowed_special=self.allowed_special
+                    ),
+                    "zh",
+                    token_max_n=80,
+                    token_min_n=60,
+                    merge_len=20,
+                    comma_split=False,
+                )
+            )
+        else:
+            text = self.en_tn_model.normalize(text)
+            text = spell_out_number(text, self.inflect_parser)
+            texts = list(
+                split_paragraph(
+                    text,
+                    partial(
+                        self.tokenizer.encode, allowed_special=self.allowed_special
+                    ),
+                    "en",
+                    token_max_n=80,
+                    token_min_n=60,
+                    merge_len=20,
+                    comma_split=False,
+                )
+            )
+        texts = [i for i in texts if not is_only_punctuation(i)]
+        return texts if split is True else text
+
+    def frontend_sft(self, tts_text, spk_id):
+        """Build the model input from a stored speaker's embedding.
+
+        Raises:
+            KeyError: if ``spk_id`` is not in ``spk2info``.
+        """
+        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
+        embedding = self.spk2info[spk_id]["embedding"]
+        model_input = {
+            "text": tts_text_token,
+            "text_len": tts_text_token_len,
+            "llm_embedding": embedding,
+            "flow_embedding": embedding,
+        }
+        return model_input
+
+    def frontend_zero_shot(
+        self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id
+    ):
+        """Build the model input from prompt text and prompt speech.
+
+        When ``zero_shot_spk_id`` is the empty string the prompt features are
+        computed from the arguments; otherwise a copy of the cached
+        ``spk2info`` entry is used. The text tokens are added in both cases.
+        """
+        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
+        if zero_shot_spk_id == "":
+            prompt_text_token, prompt_text_token_len = self._extract_text_token(
+                prompt_text
+            )
+            # NOTE(review): `torchaudio` is not imported in this file (NameError
+            # as written) — confirm the intended resampling implementation.
+            prompt_speech_resample = torchaudio.transforms.Resample(
+                orig_freq=16000, new_freq=resample_rate
+            )(prompt_speech_16k)
+            speech_feat, speech_feat_len = self._extract_speech_feat(
+                prompt_speech_resample
+            )
+            speech_token, speech_token_len = self._extract_speech_token(
+                prompt_speech_16k
+            )
+            if resample_rate == 24000:
+                # Trim so that the feature length is exactly twice the token length.
+                token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
+                speech_feat, speech_feat_len[:] = (
+                    speech_feat[:, : 2 * token_len],
+                    2 * token_len,
+                )
+                speech_token, speech_token_len[:] = (
+                    speech_token[:, :token_len],
+                    token_len,
+                )
+            embedding = self._extract_spk_embedding(prompt_speech_16k)
+            model_input = {
+                "prompt_text": prompt_text_token,
+                "prompt_text_len": prompt_text_token_len,
+                "llm_prompt_speech_token": speech_token,
+                "llm_prompt_speech_token_len": speech_token_len,
+                "flow_prompt_speech_token": speech_token,
+                "flow_prompt_speech_token_len": speech_token_len,
+                "prompt_speech_feat": speech_feat,
+                "prompt_speech_feat_len": speech_feat_len,
+                "llm_embedding": embedding,
+                "flow_embedding": embedding,
+            }
+        else:
+            # Shallow-copy the cached entry: the key assignments below, and the
+            # `del`s performed by frontend_cross_lingual / frontend_instruct2
+            # on the returned dict, must not alter spk2info itself.
+            model_input = dict(self.spk2info[zero_shot_spk_id])
+        model_input["text"] = tts_text_token
+        model_input["text_len"] = tts_text_token_len
+        return model_input
+
+    def frontend_cross_lingual(
+        self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id
+    ):
+        """Zero-shot input with an empty prompt text and the LLM prompt keys removed."""
+        model_input = self.frontend_zero_shot(
+            tts_text, "", prompt_speech_16k, resample_rate, zero_shot_spk_id
+        )
+        del model_input["prompt_text"]
+        del model_input["prompt_text_len"]
+        del model_input["llm_prompt_speech_token"]
+        del model_input["llm_prompt_speech_token_len"]
+        return model_input
+
+    def frontend_instruct(self, tts_text, spk_id, instruct_text):
+        """SFT input without ``llm_embedding``, with the instruction as prompt text."""
+        model_input = self.frontend_sft(tts_text, spk_id)
+        del model_input["llm_embedding"]
+        instruct_text_token, instruct_text_token_len = self._extract_text_token(
+            instruct_text + "<endofprompt>"
+        )
+        model_input["prompt_text"] = instruct_text_token
+        model_input["prompt_text_len"] = instruct_text_token_len
+        return model_input
+
+    def frontend_instruct2(
+        self,
+        tts_text,
+        instruct_text,
+        prompt_speech_16k,
+        resample_rate,
+        zero_shot_spk_id,
+    ):
+        """Zero-shot input using the instruction as prompt text, without LLM prompt speech tokens."""
+        model_input = self.frontend_zero_shot(
+            tts_text,
+            instruct_text + "<|endofprompt|>",
+            prompt_speech_16k,
+            resample_rate,
+            zero_shot_spk_id,
+        )
+        del model_input["llm_prompt_speech_token"]
+        del model_input["llm_prompt_speech_token_len"]
+        return model_input
+
+    def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):
+        """Build the voice-conversion input from source and prompt speech."""
+        prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(
+            prompt_speech_16k
+        )
+        # NOTE(review): `torchaudio` is not imported in this file (NameError as
+        # written) — confirm the intended resampling implementation.
+        prompt_speech_resample = torchaudio.transforms.Resample(
+            orig_freq=16000, new_freq=resample_rate
+        )(prompt_speech_16k)
+        prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(
+            prompt_speech_resample
+        )
+        embedding = self._extract_spk_embedding(prompt_speech_16k)
+        source_speech_token, source_speech_token_len = self._extract_speech_token(
+            source_speech_16k
+        )
+        model_input = {
+            "source_speech_token": source_speech_token,
+            "source_speech_token_len": source_speech_token_len,
+            "flow_prompt_speech_token": prompt_speech_token,
+            "flow_prompt_speech_token_len": prompt_speech_token_len,
+            "prompt_speech_feat": prompt_speech_feat,
+            "prompt_speech_feat_len": prompt_speech_feat_len,
+            "flow_embedding": embedding,
+        }
+        return model_input
\ No newline at end of file
diff --git a/paddlespeech/t2s/models/CosyVoice/llm.py b/paddlespeech/t2s/models/CosyVoice/llm.py
new file mode 100644
index 0000000000..6c509d9ab3
--- /dev/null
+++ b/paddlespeech/t2s/models/CosyVoice/llm.py
@@ -0,0 +1,737 @@
+import queue
+import random
+import threading
+import time
+from typing import Callable, Dict, Generator, List, Optional
+import logging
+import paddle.nn.functional as F
+import paddle
+IGNORE_ID = -1
+# from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
+# from cosyvoice.utils.common import IGNORE_ID, th_accuracy
+# from cosyvoice.utils.file_utils import logging
+# from cosyvoice.utils.mask import make_pad_mask
+import torch
+LabelSmoothingLoss = None
+def ras_sampling(weighted_scores, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1):
+    """Sample with nucleus sampling, falling back to random sampling on repetition.
+
+    The nucleus-sampled id is compared against the last ``win_size`` decoded
+    tokens; if it occurs at least ``win_size * tau_r`` times there, a fresh id
+    is drawn with ``random_sampling`` instead.
+    """
+    candidate = nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k)
+    window = paddle.to_tensor(decoded_tokens[-win_size:], dtype='int64')
+    matches = window.cpu() == candidate.cpu()
+    repeat_count = paddle.sum(matches).cpu().item()
+    if repeat_count < win_size * tau_r:
+        return candidate
+    return random_sampling(weighted_scores, decoded_tokens, sampling)
+
+
+def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25):
+    """Draw one id from the most probable entries of ``softmax(weighted_scores)``.
+
+    Entries are taken in descending probability until either the accumulated
+    probability reaches ``top_p`` or ``top_k`` entries have been collected;
+    one id is then drawn from the kept entries with ``paddle.multinomial``.
+    """
+    probs = paddle.nn.functional.softmax(weighted_scores, axis=0)
+    order = paddle.argsort(probs, axis=0, descending=True)
+    ranked = paddle.gather(probs, order, axis=0)
+
+    kept_probs = []
+    kept_ids = []
+    mass = 0.0
+    rank = 0
+    total = len(order)
+    while rank < total and mass < top_p and len(kept_probs) < top_k:
+        prob = ranked[rank]
+        mass += prob.item()
+        kept_probs.append(prob)
+        kept_ids.append(order[rank])
+        rank += 1
+
+    kept_probs = paddle.to_tensor(kept_probs, dtype=weighted_scores.dtype)
+    kept_ids = paddle.to_tensor(kept_ids, dtype='int64')
+    choice = paddle.multinomial(kept_probs, num_samples=1, replacement=True)
+    return kept_ids[choice]
+
+
+def random_sampling(weighted_scores, decoded_tokens, sampling):
+    """Draw one id from ``softmax(weighted_scores)``.
+
+    ``decoded_tokens`` and ``sampling`` are accepted but not used.
+    """
+    distribution = paddle.nn.functional.softmax(weighted_scores, axis=0)
+    return paddle.multinomial(distribution, num_samples=1, replacement=True)
+def make_pad_mask(lengths: paddle.Tensor, max_len: int = 0) -> paddle.Tensor:
+    """Return a boolean mask that is True at positions >= the row's length.
+
+    ``max_len`` sets the mask width; when it is not positive, the largest
+    value in ``lengths`` is used.
+    """
+    if max_len <= 0:
+        max_len = lengths.max().item()
+    positions = paddle.arange(0, max_len, dtype='int64').unsqueeze(0)
+    positions = positions.expand([lengths.shape[0], max_len])
+    return positions >= lengths.unsqueeze(-1)
+
+def th_accuracy(pad_outputs: paddle.Tensor, pad_targets: paddle.Tensor,
+                ignore_label: int) -> paddle.Tensor:
+    """Return the fraction of non-ignored targets matched by the argmax prediction.
+
+    ``pad_outputs`` is reshaped to ``pad_targets``' first two dimensions plus a
+    trailing axis, over which the argmax is taken. Positions where the target
+    equals ``ignore_label`` are excluded. The result is detached.
+    """
+    batch, steps = pad_targets.shape[0], pad_targets.shape[1]
+    predictions = pad_outputs.reshape((batch, steps, -1)).argmax(axis=2)
+    valid = pad_targets != ignore_label
+    hits = (predictions[valid] == pad_targets[valid]).astype('float32')
+    correct = paddle.sum(hits)
+    total = paddle.sum(valid.astype('float32'))
+    return (correct / total).detach()
+class TransformerLM(paddle.nn.Layer):
+    """Language model mapping text tokens and a speaker embedding to speech tokens.
+
+    The input sequence fed to ``llm`` is the concatenation of a start
+    embedding, the projected speaker embedding, the encoded text, a task-id
+    embedding and the speech-token embeddings.
+    """
+
+    def __init__(
+        self,
+        text_encoder_input_size: int,
+        llm_input_size: int,
+        llm_output_size: int,
+        text_token_size: int,
+        speech_token_size: int,
+        text_encoder: paddle.nn.Layer,
+        llm: paddle.nn.Layer,
+        sampling: Callable,
+        length_normalized_loss: bool = True,
+        lsm_weight: float = 0.0,
+        spk_embed_dim: int = 192,
+    ):
+        """Create the embedding tables, projections, loss and sampler.
+
+        ``sampling`` is the callable invoked by ``sampling_ids`` as
+        ``sampling(weighted_scores, decoded_tokens, sampling_arg)``.
+        """
+        super().__init__()
+        self.llm_input_size = llm_input_size
+        self.speech_token_size = speech_token_size
+        self.text_embedding = paddle.nn.Embedding(
+            text_token_size, text_encoder_input_size
+        )
+        self.text_encoder = text_encoder
+        self.text_encoder_affine_layer = paddle.nn.Linear(
+            in_features=self.text_encoder.output_size(), out_features=llm_input_size
+        )
+        # Row indices into llm_embedding: 0 = start/end marker, 1 = task id.
+        self.sos_eos = 0
+        self.task_id = 1
+        self.llm_embedding = paddle.nn.Embedding(2, llm_input_size)
+        self.llm = llm
+        # One extra output class: index speech_token_size is treated as the
+        # stop token in forward() and inference().
+        self.llm_decoder = paddle.nn.Linear(
+            in_features=llm_output_size, out_features=speech_token_size + 1
+        )
+        
+        # NOTE(review): this module binds LabelSmoothingLoss to None, so this
+        # call raises TypeError unless the name is rebound to a real class first.
+        self.criterion_ce = LabelSmoothingLoss(
+            size=speech_token_size + 1,
+            padding_idx=IGNORE_ID,
+            smoothing=lsm_weight,
+            normalize_length=length_normalized_loss,
+        )
+        self.speech_embedding = paddle.nn.Embedding(speech_token_size, llm_input_size)
+        self.spk_embed_affine_layer = paddle.nn.Linear(
+            in_features=spk_embed_dim, out_features=llm_input_size
+        )
+        self.sampling = sampling
+
+    def encode(self, text: paddle.Tensor, text_lengths: paddle.Tensor):
+        """Run the text encoder and project its output to ``llm_input_size``.
+
+        Returns the projected encoder output and the per-item lengths obtained
+        by summing the encoder mask.
+        """
+        encoder_out, encoder_mask = self.text_encoder(
+            text, text_lengths, decoding_chunk_size=1, num_decoding_left_chunks=-1
+        )
+        encoder_out_lens = encoder_mask.squeeze(1).sum(1)
+        encoder_out = self.text_encoder_affine_layer(encoder_out)
+        return encoder_out, encoder_out_lens
+
+    def pad_unpad_sequence(
+        self,
+        sos_eos_emb,
+        embedding,
+        text_token,
+        text_token_len,
+        task_id_emb,
+        speech_token,
+        speech_token_len,
+    ):
+        """Concatenate the per-item pieces of the LM input and batch them.
+
+        Returns ``(lm_input, lm_input_len)``.
+        """
+
+        # NOTE(review): presumably a port of torch's unpad_sequence /
+        # pad_sequence; verify that paddle.static.nn.sequence_unpad accepts
+        # these arguments and works outside static-graph mode.
+        text_token = paddle.static.nn.sequence_unpad(
+            text_token, text_token_len.cpu()
+        )
+        speech_token = paddle.static.nn.sequence_unpad(
+            speech_token, speech_token_len.cpu()
+        )
+        # NOTE(review): paddle.cat / paddle.tensor / dim= / .size(0) are
+        # torch-style spellings — confirm they exist in the targeted Paddle version.
+        lm_input = [
+            paddle.cat(
+                [
+                    sos_eos_emb.squeeze(dim=0),
+                    embedding[i],
+                    text_token[i],
+                    task_id_emb.squeeze(dim=0),
+                    speech_token[i],
+                ],
+                dim=0,
+            )
+            for i in range(len(text_token))
+        ]
+        lm_input_len = paddle.tensor([i.size(0) for i in lm_input], dtype=paddle.int32)
+        # NOTE(review): the batch_first / padding_value arguments suggest a
+        # padding call was intended here rather than sequence_unpad — confirm.
+        lm_input = paddle.static.nn.sequence_unpad(
+            lm_input, batch_first=True, padding_value=IGNORE_ID
+        )
+        return lm_input, lm_input_len
+
+    def forward(
+        self, batch: dict, device: torch.device
+    ) -> Dict[str, Optional[paddle.Tensor]]:
+        """
+        Compute the training loss and token accuracy for one batch.
+
+        Reads the keys "text_token", "text_token_len", "speech_token",
+        "speech_token_len" and "embedding" from ``batch`` and returns a dict
+        with "loss" and "acc".
+
+        Args:
+            text: (B, L, D)
+            text_lengths: (B,)
+            audio: (B, T, N) or (B, T)
+            audio_lengths: (B,)
+        """
+        text_token = batch["text_token"].to(device)
+        text_token_len = batch["text_token_len"].to(device)
+        speech_token = batch["speech_token"].to(device)
+        speech_token_len = batch["speech_token_len"].to(device)
+        embedding = batch["embedding"].to(device)
+        # Targets: IGNORE_ID for the two marker positions plus the text,
+        # then the speech tokens, then the stop id (speech_token_size).
+        lm_target = [
+            paddle.tensor(
+                [IGNORE_ID] * (2 + text_token_len[i])
+                + speech_token[i, : speech_token_len[i]].tolist()
+                + [self.speech_token_size]
+            )
+            for i in range(text_token.size(0))
+        ]
+        # NOTE(review): torch's pad_sequence is applied to tensors built with
+        # paddle here — confirm this mixed usage is intended.
+        lm_target = torch.nn.utils.rnn.pad_sequence(
+            lm_target, batch_first=True, padding_value=IGNORE_ID
+        ).to(device)
+        text_token = self.text_embedding(text_token)
+        text_token, text_token_len = self.encode(text_token, text_token_len)
+        embedding = paddle.nn.functional.normalize(x=embedding, axis=1)
+        embedding = self.spk_embed_affine_layer(embedding)
+        embedding = embedding.unsqueeze(1)
+        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
+        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
+        speech_token = self.speech_embedding(speech_token)
+        lm_input, lm_input_len = self.pad_unpad_sequence(
+            sos_eos_emb,
+            embedding,
+            text_token,
+            text_token_len,
+            task_id_emb,
+            speech_token,
+            speech_token_len,
+        )
+        lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
+        logits = self.llm_decoder(lm_output)
+        loss = self.criterion_ce(logits, lm_target)
+        acc = th_accuracy(
+            logits.view(-1, self.speech_token_size + 1),
+            lm_target,
+            ignore_label=IGNORE_ID,
+        )
+        return {"loss": loss, "acc": acc}
+
+    def sampling_ids(
+        self,
+        weighted_scores: paddle.Tensor,
+        decoded_tokens: List,
+        sampling: int,
+        ignore_eos: bool = True,
+    ):
+        """Sample a token id, resampling while the stop id is drawn and ``ignore_eos`` is set.
+
+        Raises:
+            RuntimeError: if the stop id is still drawn after more than
+                100 retries.
+        """
+        num_trials, max_trials = 0, 100
+        while True:
+            top_ids = self.sampling(weighted_scores, decoded_tokens, sampling)
+            # Accept the draw unless the stop id must be ignored and was drawn.
+            if not ignore_eos or self.speech_token_size not in top_ids:
+                break
+            num_trials += 1
+            if num_trials > max_trials:
+                raise RuntimeError(
+                    "sampling reaches max_trials {} and still get eos when ignore_eos is True, check your input!".format(
+                        max_trials
+                    )
+                )
+        return top_ids
+
+    @paddle.no_grad()
+    def inference(
+        self,
+        text: paddle.Tensor,
+        text_len: paddle.Tensor,
+        prompt_text: paddle.Tensor,
+        prompt_text_len: paddle.Tensor,
+        prompt_speech_token: paddle.Tensor,
+        prompt_speech_token_len: paddle.Tensor,
+        embedding: paddle.Tensor,
+        sampling: int = 25,
+        max_token_text_ratio: float = 20,
+        min_token_text_ratio: float = 2,
+        uuid: str = "",
+    ) -> Generator[paddle.Tensor, None, None]:
+        """Autoregressively generate speech token ids, yielding them one at a time.
+
+        Generation stops when the stop id (``speech_token_size``) is sampled
+        or after ``max_len`` steps. ``min_len`` / ``max_len`` are the text
+        length (excluding the prompt) times ``min_token_text_ratio`` /
+        ``max_token_text_ratio``. ``uuid`` is not used in this method.
+        """
+        device = text.place
+        text = paddle.cat([prompt_text, text], dim=1)
+        # NOTE(review): in-place add on the argument; presumably this also
+        # changes the caller's tensor — confirm that is acceptable.
+        text_len += prompt_text_len
+        text = self.text_embedding(text)
+        text, text_len = self.encode(text, text_len)
+        if embedding.shape[0] != 0:
+            embedding = paddle.nn.functional.normalize(x=embedding, axis=1)
+            embedding = self.spk_embed_affine_layer(embedding)
+            embedding = embedding.unsqueeze(dim=1)
+        else:
+            # No speaker embedding supplied: use a zero-length placeholder.
+            embedding = (
+                paddle.zeros(1, 0, self.llm_input_size, dtype=text.dtype)
+                .to(device)
+                .to(text.dtype)
+            )
+        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
+        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
+        if prompt_speech_token_len != 0:
+            prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
+        else:
+            prompt_speech_token_emb = paddle.zeros(
+                1, 0, self.llm_input_size, dtype=text.dtype
+            ).to(device)
+        lm_input = paddle.cat(
+            [sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1
+        )
+        min_len = int((text_len - prompt_text_len) * min_token_text_ratio)
+        max_len = int((text_len - prompt_text_len) * max_token_text_ratio)
+        out_tokens = []
+        offset = 0
+        # Empty caches for the first forward_chunk call.
+        att_cache, cnn_cache = paddle.zeros(
+            (0, 0, 0, 0), device=lm_input.place
+        ), paddle.zeros((0, 0, 0, 0), device=lm_input.place)
+        for i in range(max_len):
+            y_pred, att_cache, cnn_cache = self.llm.forward_chunk(
+                lm_input,
+                offset=offset,
+                required_cache_size=-1,
+                att_cache=att_cache,
+                cnn_cache=cnn_cache,
+                att_mask=paddle.tril(
+                    paddle.ones(
+                        (1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.place
+                    )
+                ).to(paddle.bool),
+            )
+            logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
+            if i == 0:
+                # Never allow the stop id as the very first token.
+                logp[:, self.speech_token_size] = -float("inf")
+            top_ids = self.sampling_ids(
+                logp.squeeze(dim=0),
+                out_tokens,
+                sampling,
+                ignore_eos=True if i < min_len else False,
+            ).item()
+            if top_ids == self.speech_token_size:
+                break
+            yield top_ids
+            out_tokens.append(top_ids)
+            # After the first step only the newly sampled token is fed in.
+            offset += lm_input.size(1)
+            lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
+
+
+class Qwen2Encoder(paddle.nn.Layer):
+    """Thin wrapper returning the last hidden state of a wrapped causal LM."""
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(self, xs: paddle.Tensor, xs_lens: paddle.Tensor):
+        """Run the model on embeddings ``xs`` with a mask built from ``xs_lens``.
+
+        Returns the last hidden state and the mask with an axis inserted at
+        position 1.
+        """
+        seq_len = xs.size(1)
+        valid_mask = ~make_pad_mask(xs_lens, seq_len)
+        model_out = self.model(
+            inputs_embeds=xs,
+            attention_mask=valid_mask,
+            output_hidden_states=True,
+            return_dict=True,
+        )
+        last_hidden = model_out.hidden_states[-1]
+        return last_hidden, valid_mask.unsqueeze(1)
+
+    def forward_one_step(self, xs, masks, cache=None):
+        """Run one cached decoding step.
+
+        Uses ``masks[:, -1, :]`` as the attention mask and returns the last
+        hidden state cast to float32 together with the updated cache.
+        """
+        model_out = self.model(
+            inputs_embeds=xs,
+            attention_mask=masks[:, -1, :],
+            output_hidden_states=True,
+            return_dict=True,
+            use_cache=True,
+            past_key_values=cache,
+        )
+        last_hidden = paddle.cast(model_out.hidden_states[-1], dtype = 'float32')
+        return last_hidden, model_out.past_key_values
+
+
+class Qwen2LM(TransformerLM):
+    def __init__(
+        self,
+        llm_input_size: int,
+        llm_output_size: int,
+        speech_token_size: int,
+        llm: paddle.nn.Layer,
+        sampling: Callable,
+        length_normalized_loss: bool = True,
+        lsm_weight: float = 0.0,
+        mix_ratio: List[int] = [5, 15],
+    ):
+        """Create the embeddings, decoder and sampler around ``llm``.
+
+        Only ``paddle.nn.Layer.__init__`` is called, so none of the parent
+        class's text-encoder attributes are created. ``length_normalized_loss``
+        and ``lsm_weight`` are currently unused because the loss construction
+        below is commented out.
+        """
+        paddle.nn.Layer.__init__(self)
+        self.llm_input_size = llm_input_size
+        self.llm_output_size = llm_output_size
+        self.speech_token_size = speech_token_size
+        self.sos_eos = 0
+        self.task_id = 1
+        self.fill_token = 2
+        self.llm_embedding = paddle.nn.Embedding(2, llm_input_size)
+        self.llm = llm
+        # Three extra output classes beyond the speech tokens; their ids are
+        # collected in stop_token_ids below.
+        self.llm_decoder = paddle.nn.Linear(
+            in_features=llm_output_size, out_features=speech_token_size + 3
+        )
+        # Disabled: bfloat16 cast of the decoder parameters.
+        # self.llm_decoder.weight = paddle.create_parameter(
+        #     shape=self.llm_decoder.weight.shape,
+        #     dtype='bfloat16',
+        #     default_initializer=paddle.nn.initializer.Assign(self.llm_decoder.weight.astype('bfloat16'))
+        # )
+        # if self.llm_decoder.bias is not None:
+        #     self.llm_decoder.bias = paddle.create_parameter(
+        #         shape=self.llm_decoder.bias.shape,
+        #         dtype='bfloat16',
+        #         default_initializer=paddle.nn.initializer.Assign(self.llm_decoder.bias.astype('bfloat16'))
+        #     )
+        # Disabled: training loss (LabelSmoothingLoss is bound to None in this module).
+        # self.criterion_ce = LabelSmoothingLoss(
+        #     size=speech_token_size + 3,
+        #     padding_idx=IGNORE_ID,
+        #     smoothing=lsm_weight,
+        #     normalize_length=length_normalized_loss,
+        # )
+        self.speech_embedding = paddle.nn.Embedding(
+            speech_token_size + 3, llm_input_size
+        )
+        self.sampling = sampling
+        # Copy so instances never share (or mutate) the default list object.
+        self.mix_ratio = list(mix_ratio)
+        self.stop_token_ids = [(speech_token_size + i) for i in range(3)]
+        self.vllm_output_queue = {}
+
+    # def prepare_lm_input_target(
+    #     self,
+    #     text_token,
+    #     text_token_emb,
+    #     text_token_len,
+    #     speech_token,
+    #     speech_token_emb,
+    #     speech_token_len,
+    # ):
+    #     lm_target, lm_input = [], []
+    #     text_token = torch.nn.utils.rnn.unpad_sequence(
+    #         text_token, text_token_len.cpu(), batch_first=True
+    #     )
+    #     speech_token = torch.nn.utils.rnn.unpad_sequence(
+    #         speech_token, speech_token_len.cpu(), batch_first=True
+    #     )
+    #     text_token_emb = torch.nn.utils.rnn.unpad_sequence(
+    #         text_token_emb, text_token_len.cpu(), batch_first=True
+    #     )
+    #     speech_token_emb = torch.nn.utils.rnn.unpad_sequence(
+    #         speech_token_emb, speech_token_len.cpu(), batch_first=True
+    #     )
+    #     for i in range(len(text_token)):
+    #         if (
+    #             random.random() < 0.5
+    #             and speech_token_len[i] / text_token_len[i]
+    #             > self.mix_ratio[1] / self.mix_ratio[0]
+    #         ):
+    #             this_lm_target, this_lm_input = [], []
+    #             this_lm_target.append(IGNORE_ID)
+    #             this_lm_input.append(
+    #                 self.llm_embedding.weight[self.sos_eos].reshape(1, -1)
+    #             )
+    #             for j in range(
+    #                 ((text_token_len[i] + 1) / self.mix_ratio[0]).ceil().int().item()
+    #             ):
+    #                 this_text_token = text_token[i][
+    #                     j * self.mix_ratio[0] : (j + 1) * self.mix_ratio[0]
+    #                 ].tolist()
+    #                 this_speech_token = speech_token[i][
+    #                     j * self.mix_ratio[1] : (j + 1) * self.mix_ratio[1]
+    #                 ].tolist()
+    #                 if len(this_text_token) == self.mix_ratio[0]:
+    #                     assert len(this_speech_token) == self.mix_ratio[1]
+    #                     this_lm_target += [IGNORE_ID] * (self.mix_ratio[0] - 1)
+    #                     this_lm_target += this_speech_token
+    #                     this_lm_target.append(self.speech_token_size + 2)
+    #                     this_lm_input.append(
+    #                         text_token_emb[i][
+    #                             j * self.mix_ratio[0] : (j + 1) * self.mix_ratio[0]
+    #                         ]
+    #                     )
+    #                     this_lm_input.append(
+    #                         speech_token_emb[i][
+    #                             j * self.mix_ratio[1] : (j + 1) * self.mix_ratio[1]
+    #                         ]
+    #                     )
+    #                 else:
+    #                     this_lm_target += [-1] * len(this_text_token)
+    #                     this_lm_target += speech_token[i][
+    #                         j * self.mix_ratio[1] :
+    #                     ].tolist()
+    #                     this_lm_target.append(self.speech_token_size)
+    #                     this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0] :])
+    #                     this_lm_input.append(
+    #                         self.llm_embedding.weight[self.task_id].reshape(1, -1)
+    #                     )
+    #                     this_lm_input.append(
+    #                         speech_token_emb[i][j * self.mix_ratio[1] :]
+    #                     )
+    #             this_lm_target, this_lm_input = paddle.tensor(
+    #                 this_lm_target
+    #             ), paddle.cat(this_lm_input, dim=0)
+    #         else:
+    #             this_lm_target = paddle.tensor(
+    #                 [IGNORE_ID] * (1 + text_token_len[i])
+    #                 + speech_token[i].tolist()
+    #                 + [self.speech_token_size]
+    #             )
+    #             this_lm_input = paddle.cat(
+    #                 [
+    #                     self.llm_embedding.weight[self.sos_eos].reshape(1, -1),
+    #                     text_token_emb[i],
+    #                     self.llm_embedding.weight[self.task_id].reshape(1, -1),
+    #                     speech_token_emb[i],
+    #                 ],
+    #                 dim=0,
+    #             )
+    #         lm_target.append(this_lm_target)
+    #         lm_input.append(this_lm_input)
+    #     lm_input_len = paddle.tensor([i.size(0) for i in lm_input], dtype=paddle.int32)
+    #     lm_input = torch.nn.utils.rnn.pad_sequence(
+    #         lm_input, batch_first=True, padding_value=IGNORE_ID
+    #     )
+    #     lm_target = torch.nn.utils.rnn.pad_sequence(
+    #         lm_target, batch_first=True, padding_value=IGNORE_ID
+    #     )
+    #     return lm_target, lm_input, lm_input_len
+
+    @paddle.no_grad()
+    def inference(
+        self,
+        text: paddle.Tensor,
+        text_len: paddle.Tensor,
+        prompt_text: paddle.Tensor,
+        prompt_text_len: paddle.Tensor,
+        prompt_speech_token: paddle.Tensor,
+        prompt_speech_token_len: paddle.Tensor,
+        embedding: paddle.Tensor,
+        sampling: int = 25,
+        max_token_text_ratio: float = 20,
+        min_token_text_ratio: float = 2,
+        uuid: str = "",
+    ) -> Generator[paddle.Tensor, None, None]:
+        """Generate speech tokens for ``text`` given an optional prompt.
+
+        The LM input is ``[sos_eos, prompt_text + text, task_id,
+        prompt_speech]`` (all as embeddings), which is then handed to
+        ``inference_wrapper`` for step-by-step decoding.
+
+        Args:
+            text: Text token ids to synthesize.
+            text_len: Length of ``text``; bounds the number of generated
+                tokens through the two ratio arguments. Not modified.
+            prompt_text: Prompt text token ids, prepended to ``text``.
+            prompt_text_len: Unused apart from being part of the interface.
+            prompt_speech_token: Prompt speech token ids.
+            prompt_speech_token_len: When zero, an empty prompt-speech
+                embedding is used instead of ``prompt_speech_token``.
+            embedding: Unused by this method.
+            sampling: Forwarded to ``inference_wrapper``.
+            max_token_text_ratio: Upper bound on generated tokens per text
+                token.
+            min_token_text_ratio: Lower bound on generated tokens per text
+                token.
+            uuid: Request id forwarded to ``inference_wrapper``.
+
+        Yields:
+            Whatever ``inference_wrapper`` yields (plain token ids obtained
+            via ``.item()`` on the native path).
+        """
+        device = text.place
+        # The generation budget depends only on the target text length, so it
+        # is computed from the untouched ``text_len`` instead of adding and
+        # then subtracting ``prompt_text_len`` on the caller's tensor.
+        min_len = int(text_len * min_token_text_ratio)
+        max_len = int(text_len * max_token_text_ratio)
+
+        text = paddle.cat([prompt_text, text], dim=1)
+        # Cast right after the lookup so every piece concatenated below
+        # (including the empty prompt placeholder) has the same dtype.
+        text = paddle.cast(self.llm.model.qwen2.embed_tokens(text), dtype="float32")
+        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape([1, 1, -1])
+        task_id_emb = self.llm_embedding.weight[self.task_id].reshape([1, 1, -1])
+        if prompt_speech_token_len != 0:
+            prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
+        else:
+            # Shape is passed as a list, matching the other paddle.zeros
+            # calls in this code base.
+            prompt_speech_token_emb = paddle.zeros(
+                [1, 0, self.llm_input_size], dtype=text.dtype
+            ).to(device)
+        lm_input = paddle.cat(
+            [sos_eos_emb, text, task_id_emb, prompt_speech_token_emb], dim=1
+        )
+        yield from self.inference_wrapper(lm_input, sampling, min_len, max_len, uuid)
+
+    @paddle.no_grad()
+    def inference_wrapper(self, lm_input, sampling, min_len, max_len, uuid):
+        """Decode speech tokens from ``lm_input``, one token per step.
+
+        When the instance has a ``vllm`` attribute the request is submitted
+        to that engine and tokens are read back through a per-request queue;
+        otherwise decoding runs locally through ``self.llm.forward_one_step``.
+
+        Args:
+            lm_input: Embedded LM input.
+            sampling: Top-k value (vLLM path) / argument forwarded to
+                ``sampling_ids`` (native path).
+            min_len: Minimum number of tokens before a stop token is allowed.
+            max_len: Maximum number of decoding steps / tokens.
+            uuid: Request id; only used on the vLLM path.
+
+        Yields:
+            Generated speech token ids.
+        """
+        if not hasattr(self, "vllm"):
+            # ---- native step-by-step decoding ----
+            generated = []
+            kv_cache = None
+            for step in range(max_len):
+                cur_len = lm_input.shape[1]
+                causal_mask = paddle.tril(
+                    paddle.ones(
+                        (1, cur_len, cur_len),
+                    )
+                ).to(paddle.bool)
+                y_pred, kv_cache = self.llm.forward_one_step(
+                    lm_input,
+                    masks=causal_mask,
+                    cache=kv_cache,
+                )
+                logp = F.log_softmax(self.llm_decoder(y_pred[:, -1]), axis = -1)
+                top_ids = self.sampling_ids(
+                    logp.squeeze(axis=0),
+                    generated,
+                    sampling,
+                    ignore_eos=step < min_len,
+                ).item()
+                if top_ids == self.speech_token_size:
+                    # End-of-sequence token.
+                    break
+                if top_ids > self.speech_token_size:
+                    # Other reserved ids are dropped; the step is retried
+                    # with the same input.
+                    continue
+                yield top_ids
+                generated.append(top_ids)
+                lm_input = self.speech_embedding.weight[top_ids].reshape([1, 1, -1])
+            return
+
+        # ---- vLLM-backed decoding ----
+        from vllm import RequestOutput, SamplingParams
+
+        sampling_params = SamplingParams(
+            top_k=sampling,
+            stop_token_ids=self.stop_token_ids,
+            min_tokens=min_len,
+            max_tokens=max_len,
+        )
+        prompt = {
+            "prompt_embeds": lm_input.squeeze(0)
+            .to(paddle.bfloat16)
+            .to(lm_input.place)
+        }
+        with self.lock:
+            self.vllm.add_request(uuid, prompt, sampling_params)
+            self.vllm_output_queue[uuid] = queue.Queue()
+
+        emitted = 0
+        while True:
+            with self.lock:
+                # Whichever request finds its queue empty drives the engine
+                # and distributes the newest token of every request.
+                if self.vllm_output_queue[uuid].empty():
+                    request_outputs: List[RequestOutput] = self.vllm.step()
+                    for request_output in request_outputs:
+                        newest = list(request_output.outputs[0].token_ids)[-1]
+                        self.vllm_output_queue[request_output.request_id].put(newest)
+            if not self.vllm_output_queue[uuid].empty():
+                top_ids = self.vllm_output_queue[uuid].get()
+                if top_ids in self.stop_token_ids:
+                    break
+                yield top_ids
+                emitted += 1
+                if emitted == max_len:
+                    break
+            time.sleep(0.001)
+        with self.lock:
+            self.vllm_output_queue.pop(uuid)
+
+    @paddle.no_grad()
+    def inference_bistream(
+        self,
+        text: Generator,
+        prompt_text: paddle.Tensor,
+        prompt_text_len: paddle.Tensor,
+        prompt_speech_token: paddle.Tensor,
+        prompt_speech_token_len: paddle.Tensor,
+        embedding: paddle.Tensor,
+        sampling: int = 25,
+        max_token_text_ratio: float = 20,
+        min_token_text_ratio: float = 2,
+    ) -> Generator[paddle.Tensor, None, None]:
+        """Generate speech tokens while text tokens are still arriving.
+
+        Text is consumed from the ``text`` generator in chunks of
+        ``mix_ratio[0]`` tokens, interleaved with ``mix_ratio[1]`` speech
+        tokens. While text is streaming, decoding pauses whenever the model
+        emits the fill token (``speech_token_size + 2``); once ``text`` is
+        exhausted the remaining text plus the task-id embedding is appended
+        and decoding continues until the end token (``speech_token_size``).
+
+        Args:
+            text: Generator yielding text token id tensors.
+            prompt_text: Prompt text token ids.
+            prompt_text_len: Unused by this method.
+            prompt_speech_token: Prompt speech token ids.
+            prompt_speech_token_len: When zero, an empty prompt-speech
+                embedding is used.
+            embedding: Unused by this method.
+            sampling: Forwarded to ``sampling_ids``.
+            max_token_text_ratio: Unused by this method.
+            min_token_text_ratio: Unused by this method.
+
+        Yields:
+            Generated speech token ids (fill/end tokens are not yielded).
+
+        Raises:
+            ValueError: If an unexpected reserved token id is sampled.
+        """
+        device = prompt_text.place
+        text_chunk = self.mix_ratio[0]
+        speech_chunk = self.mix_ratio[1]
+        fill_token_id = self.speech_token_size + 2
+
+        def embed_text(token_ids):
+            # Same lookup path and float32 cast that ``inference`` uses, so
+            # text embeddings can be concatenated with the other embeddings.
+            return paddle.cast(
+                self.llm.model.qwen2.embed_tokens(token_ids), dtype="float32"
+            )
+
+        def decode_step(step_input, step_cache):
+            # One forward step; returns log-probs for the last position and
+            # the updated cache.
+            # NOTE(review): axis 2 of the cached key is assumed to be the
+            # sequence axis (carried over from the reference code) - confirm
+            # against the cache layout returned by forward_one_step.
+            past_len = 0 if step_cache is None else step_cache[0][0].shape[2]
+            seq_len = step_input.shape[1] + past_len
+            y_pred, step_cache = self.llm.forward_one_step(
+                step_input,
+                masks=paddle.tril(paddle.ones((1, seq_len, seq_len))).to(paddle.bool),
+                cache=step_cache,
+            )
+            logp = F.log_softmax(self.llm_decoder(y_pred[:, -1]), axis=-1)
+            return logp.squeeze(axis=0), step_cache
+
+        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape([1, 1, -1])
+        task_id_emb = self.llm_embedding.weight[self.task_id].reshape([1, 1, -1])
+        if prompt_speech_token_len != 0:
+            prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
+        else:
+            # Empty placeholder; only its (zero) length along axis 1 is used.
+            prompt_speech_token_emb = paddle.zeros(
+                [1, 0, self.llm_input_size], dtype=sos_eos_emb.dtype
+            ).to(device)
+        lm_input = paddle.cat([sos_eos_emb], dim=1)
+        out_tokens = []
+        cache = None
+        text_cache = embed_text(prompt_text)
+        next_fill_index = -1
+        for this_text in text:
+            text_cache = paddle.cat([text_cache, embed_text(this_text)], dim=1)
+            # Phase 1: interleave buffered text with the prompt speech tokens
+            # until the prompt speech is used up.
+            while prompt_speech_token_emb.shape[1] != 0:
+                if text_cache.shape[1] < text_chunk:
+                    logging.info("not enough text token to decode, wait for more")
+                    break
+                lm_input_text = text_cache[:, :text_chunk]
+                lm_input_speech = prompt_speech_token_emb[:, :speech_chunk]
+                logging.info(
+                    "append {} text token {} speech token".format(
+                        lm_input_text.shape[1], lm_input_speech.shape[1]
+                    )
+                )
+                lm_input = paddle.cat(
+                    [lm_input, lm_input_text, lm_input_speech], dim=1
+                )
+                text_cache = text_cache[:, text_chunk:]
+                prompt_speech_token_emb = prompt_speech_token_emb[:, speech_chunk:]
+            # Phase 2: prompt speech exhausted, start / resume decoding.
+            if prompt_speech_token_emb.shape[1] == 0:
+                last_was_fill = (
+                    len(out_tokens) != 0 and out_tokens[-1] == fill_token_id
+                )
+                nothing_fed_yet = len(out_tokens) == 0 and lm_input.shape[1] == 1
+                if last_was_fill or nothing_fed_yet:
+                    logging.info("get fill token, need to append more text token")
+                    if text_cache.shape[1] >= text_chunk:
+                        lm_input_text = text_cache[:, :text_chunk]
+                        logging.info(
+                            "append {} text token".format(lm_input_text.shape[1])
+                        )
+                        if last_was_fill:
+                            # Earlier context already lives in the cache.
+                            lm_input = lm_input_text
+                        else:
+                            lm_input = paddle.cat([lm_input, lm_input_text], dim=1)
+                        text_cache = text_cache[:, text_chunk:]
+                    else:
+                        logging.info("not enough text token to decode, wait for more")
+                        continue
+                while True:
+                    logp, cache = decode_step(lm_input, cache)
+                    if next_fill_index != -1 and len(out_tokens) == next_fill_index:
+                        # Force a fill token after every ``speech_chunk``
+                        # generated speech tokens.
+                        top_ids = fill_token_id
+                        next_fill_index += speech_chunk + 1
+                    else:
+                        top_ids = self.sampling_ids(
+                            logp, out_tokens, sampling, ignore_eos=True
+                        ).item()
+                    if top_ids == fill_token_id:
+                        next_fill_index = len(out_tokens) + speech_chunk + 1
+                        logging.info(
+                            "fill_token index {} next fill_token index {}".format(
+                                len(out_tokens), next_fill_index
+                            )
+                        )
+                    out_tokens.append(top_ids)
+                    if top_ids >= self.speech_token_size:
+                        if top_ids == fill_token_id:
+                            break
+                        raise ValueError("should not get token {}".format(top_ids))
+                    yield top_ids
+                    lm_input = self.speech_embedding.weight[top_ids].reshape([1, 1, -1])
+        # Phase 3: no more text will arrive; flush the buffered text and
+        # decode until the end token.
+        lm_input = paddle.cat([lm_input, text_cache, task_id_emb], dim=1)
+        logging.info("no more text token, decode until met eos")
+        while True:
+            logp, cache = decode_step(lm_input, cache)
+            top_ids = self.sampling_ids(
+                logp, out_tokens, sampling, ignore_eos=False
+            ).item()
+            out_tokens.append(top_ids)
+            if top_ids >= self.speech_token_size:
+                if top_ids == self.speech_token_size:
+                    break
+                raise ValueError("should not get token {}".format(top_ids))
+            yield top_ids
+            lm_input = self.speech_embedding.weight[top_ids].reshape([1, 1, -1])
diff --git a/paddlespeech/t2s/models/CosyVoice/model.py b/paddlespeech/t2s/models/CosyVoice/model.py
new file mode 100644
index 0000000000..aff8573bba
--- /dev/null
+++ b/paddlespeech/t2s/models/CosyVoice/model.py
@@ -0,0 +1,597 @@
+import os
+import threading
+import time
+import uuid
+from contextlib import nullcontext
+from typing import Generator
+
+import numpy as np
+import paddle
+
+# from cosyvoice.utils.common import TrtContextWrapper, fade_in_out
+# from cosyvoice.utils.file_utils import *
+# from cosyvoice.utils.file_utils import convert_onnx_to_trt, export_cosyvoice2_vllm
+
+
+class CosyVoiceModel:
+    def __init__(
+        self,
+        llm: paddle.nn.Layer,
+        flow: paddle.nn.Layer,
+        hift: paddle.nn.Layer,
+        fp16: bool = False,
+    ):
+        """Store the three sub-models and set up streaming bookkeeping.
+
+        Args:
+            llm: Token language model.
+            flow: Flow model; its ``input_frame_rate`` determines the
+                streaming hop lengths.
+            hift: Vocoder model.
+            fp16: When exactly ``True``, ``llm`` and ``flow`` are converted
+                with ``.half()``.
+        """
+        gpu_available = paddle.device.cuda.device_count() >= 1
+        self.device = device2str("cuda" if gpu_available else "cpu")
+        self.llm, self.flow, self.hift = llm, flow, hift
+        self.fp16 = fp16
+        if self.fp16 is True:
+            self.llm.half()
+            self.flow.half()
+
+        # Streaming chunk sizes, derived from the flow model's frame rate.
+        frame_rate = self.flow.input_frame_rate
+        self.token_min_hop_len = 2 * frame_rate
+        self.token_max_hop_len = 4 * frame_rate
+        self.token_overlap_len = 20
+
+        # Cross-fade windows for mel chunks and for waveform chunks.
+        self.mel_overlap_len = int(self.token_overlap_len / frame_rate * 22050 / 256)
+        self.mel_window = np.hamming(2 * self.mel_overlap_len)
+        self.mel_cache_len = 20
+        self.source_cache_len = int(self.mel_cache_len * 256)
+        self.speech_window = np.hamming(2 * self.source_cache_len)
+
+        self.stream_scale_factor = 1
+        assert (
+            self.stream_scale_factor >= 1
+        ), "stream_scale_factor should be greater than 1, change it according to your actual rtf"
+
+        # Run the LLM on its own stream when a GPU is present.
+        if gpu_available:
+            self.llm_context = paddle.device.stream_guard(
+                paddle.device.Stream(device=device2str(self.device))
+            )
+        else:
+            self.llm_context = nullcontext()
+
+        # Per-request state, keyed by request uuid; guarded by ``self.lock``.
+        self.lock = threading.Lock()
+        self.tts_speech_token_dict = {}
+        self.llm_end_dict = {}
+        self.mel_overlap_dict = {}
+        self.flow_cache_dict = {}
+        self.hift_cache_dict = {}
+
+    def load(self, llm_model, flow_model, hift_model):
+        """Load weights for the three sub-models and switch them to eval mode.
+
+        Args:
+            llm_model: Path to the LLM state dict.
+            flow_model: Path to the flow state dict.
+            hift_model: Path to the vocoder state dict; the substring
+                ``"generator."`` is removed from every key before loading.
+        """
+        llm_weights = paddle.load(path=str(llm_model))
+        self.llm.set_state_dict(state_dict=llm_weights)
+        self.llm.to(self.device).eval()
+
+        flow_weights = paddle.load(path=str(flow_model))
+        self.flow.set_state_dict(state_dict=flow_weights)
+        self.flow.to(self.device).eval()
+
+        hift_weights = {}
+        for key, value in paddle.load(path=str(hift_model)).items():
+            hift_weights[key.replace("generator.", "")] = value
+        self.hift.set_state_dict(state_dict=hift_weights)
+        self.hift.to(self.device).eval()
+
+    def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
+        """Replace three sub-modules with modules loaded via ``torch.jit.load``.
+
+        NOTE(review): ``torch`` is not among this module's top-level imports,
+        so calling this method as written would fail on the first line unless
+        ``torch`` is brought into scope elsewhere - confirm whether this
+        TorchScript path is meant to be supported in the Paddle port.
+
+        Args:
+            llm_text_encoder_model: Path for ``self.llm.text_encoder``.
+            llm_llm_model: Path for ``self.llm.llm``.
+            flow_encoder_model: Path for ``self.flow.encoder``.
+        """
+        self.llm.text_encoder = torch.jit.load(
+            llm_text_encoder_model, map_location=self.device
+        )
+        self.llm.llm = torch.jit.load(llm_llm_model, map_location=self.device)
+        self.flow.encoder = torch.jit.load(
+            flow_encoder_model, map_location=self.device
+        )
+
+    def load_trt(
+        self,
+        flow_decoder_estimator_model,
+        flow_decoder_onnx_model,
+        trt_concurrent,
+        fp16,
+    ):
+        """Swap the flow decoder estimator for a TensorRT engine.
+
+        The serialized engine at ``flow_decoder_estimator_model`` is built
+        from ``flow_decoder_onnx_model`` first when it is missing or empty.
+
+        Args:
+            flow_decoder_estimator_model: Path of the serialized engine.
+            flow_decoder_onnx_model: ONNX source used to build the engine.
+            trt_concurrent: Forwarded to ``TrtContextWrapper``.
+            fp16: Forwarded to ``convert_onnx_to_trt``.
+        """
+        assert paddle.device.cuda.device_count() >= 1, "tensorrt only supports gpu!"
+
+        engine_path = flow_decoder_estimator_model
+        needs_build = not os.path.exists(engine_path) or os.path.getsize(engine_path) == 0
+        if needs_build:
+            convert_onnx_to_trt(
+                engine_path,
+                self.get_trt_kwargs(),
+                flow_decoder_onnx_model,
+                fp16,
+            )
+
+        # The Paddle estimator is dropped before the engine is loaded.
+        del self.flow.decoder.estimator
+        import tensorrt as trt
+
+        with open(engine_path, "rb") as engine_file:
+            serialized_engine = engine_file.read()
+        trt_runtime = trt.Runtime(trt.Logger(trt.Logger.INFO))
+        estimator_engine = trt_runtime.deserialize_cuda_engine(serialized_engine)
+        assert estimator_engine is not None, "failed to load trt {}".format(
+            flow_decoder_estimator_model
+        )
+        self.flow.decoder.estimator = TrtContextWrapper(
+            estimator_engine, trt_concurrent=trt_concurrent, device=self.device
+        )
+
+    def get_trt_kwargs(self):
+        """Return the shape profile passed to the ONNX-to-TensorRT conversion.
+
+        Each of the four inputs has a fixed leading ``(batch, channels)``
+        pair; only the last dimension differs between the min / opt / max
+        profiles (4, 500 and 3000 respectively).
+        """
+        leading_dims = {"x": (2, 80), "mask": (2, 1), "mu": (2, 80), "cond": (2, 80)}
+
+        def profile(last_dim):
+            # One (batch, channels, last_dim) tuple per input, in input order.
+            return [lead + (last_dim,) for lead in leading_dims.values()]
+
+        return {
+            "min_shape": profile(4),
+            "opt_shape": profile(500),
+            "max_shape": profile(3000),
+            "input_names": list(leading_dims),
+        }
+
+    def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
+        """Run the LLM for one request and collect its speech tokens.
+
+        Intended as a thread target: generated tokens are appended to
+        ``self.tts_speech_token_dict[uuid]`` and ``self.llm_end_dict[uuid]``
+        is set to ``True`` when generation stops - including when it stops
+        because of an exception, so a consumer polling that flag cannot wait
+        forever. The exception itself still propagates.
+
+        Args:
+            text: Text token tensor, or a generator of text token tensors
+                (streaming input; requires ``CosyVoice2Model`` without vLLM).
+            prompt_text: Prompt text token tensor.
+            llm_prompt_speech_token: Prompt speech token tensor.
+            llm_embedding: Embedding tensor forwarded to the LLM.
+            uuid: Request id (note: shadows the ``uuid`` module inside this
+                method).
+        """
+
+        def length_of(tensor):
+            # One-element int32 tensor holding the size of axis 1.
+            return paddle.tensor([tensor.shape[1]], dtype=paddle.int32).to(self.device)
+
+        try:
+            with self.llm_context, paddle.amp.auto_cast(
+                enable=self.fp16 is True and hasattr(self.llm, "vllm") is False
+            ):
+                streaming_text = isinstance(text, Generator)
+                if streaming_text:
+                    assert isinstance(self, CosyVoice2Model) and not hasattr(
+                        self.llm, "vllm"
+                    ), "streaming input text is only implemented for CosyVoice2 and do not support vllm!"
+                prompt_kwargs = dict(
+                    prompt_text=prompt_text.to(self.device),
+                    prompt_text_len=length_of(prompt_text),
+                    prompt_speech_token=llm_prompt_speech_token.to(self.device),
+                    prompt_speech_token_len=length_of(llm_prompt_speech_token),
+                    embedding=llm_embedding.to(self.device),
+                )
+                if streaming_text:
+                    token_stream = self.llm.inference_bistream(
+                        text=text, **prompt_kwargs
+                    )
+                else:
+                    token_stream = self.llm.inference(
+                        text=text.to(self.device),
+                        text_len=length_of(text),
+                        uuid=uuid,
+                        **prompt_kwargs,
+                    )
+                for token in token_stream:
+                    self.tts_speech_token_dict[uuid].append(token)
+        finally:
+            # Always signal completion so the streaming loop in ``tts`` ends.
+            self.llm_end_dict[uuid] = True
+
+    def vc_job(self, source_speech_token, uuid):
+        """Voice-conversion job: use the source speech tokens as the output.
+
+        No LLM is run; the flattened token list is stored for request
+        ``uuid`` and the request is immediately marked as finished.
+        """
+        source_tokens = source_speech_token.flatten().tolist()
+        self.tts_speech_token_dict[uuid] = source_tokens
+        self.llm_end_dict[uuid] = True
+
+    def token2wav(
+        self,
+        token,
+        prompt_token,
+        prompt_feat,
+        embedding,
+        uuid,
+        finalize=False,
+        speed=1.0,
+    ):
+        """Convert a chunk of speech tokens into a waveform chunk.
+
+        Runs the flow model to get a mel spectrogram, cross-fades it with the
+        state cached for request ``uuid``, then runs the vocoder. The
+        per-request caches (``flow_cache_dict``, ``mel_overlap_dict``,
+        ``hift_cache_dict``) are read and, for non-final chunks, updated.
+
+        Args:
+            token: Speech tokens for this chunk.
+            prompt_token: Prompt speech tokens for the flow model.
+            prompt_feat: Prompt features for the flow model.
+            embedding: Embedding passed to the flow model.
+            uuid: Request id selecting the per-request caches.
+            finalize: ``False`` for an intermediate streaming chunk (tail is
+                held back and cached); ``True`` for the last / only chunk.
+            speed: Time-scaling factor; only honored when ``finalize`` is
+                true and no vocoder cache exists for the request.
+
+        Returns:
+            The waveform produced by ``self.hift.inference`` for this chunk.
+
+        NOTE(review): ``fade_in_out`` is not defined in the visible part of
+        this module (its import is commented out at the top of the file).
+        """
+        with paddle.amp.auto_cast(enable=self.fp16):
+            tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(
+                token=token.to(self.device),
+                token_len=paddle.tensor([token.shape[1]], dtype=paddle.int32).to(
+                    self.device
+                ),
+                prompt_token=prompt_token.to(self.device),
+                prompt_token_len=paddle.tensor(
+                    [prompt_token.shape[1]], dtype=paddle.int32
+                ).to(self.device),
+                prompt_feat=prompt_feat.to(self.device),
+                prompt_feat_len=paddle.tensor(
+                    [prompt_feat.shape[1]], dtype=paddle.int32
+                ).to(self.device),
+                embedding=embedding.to(self.device),
+                flow_cache=self.flow_cache_dict[uuid],
+            )
+        # Blend with the mel tail kept from the previous chunk, if any.
+        if self.mel_overlap_dict[uuid].shape[2] != 0:
+            tts_mel = fade_in_out(tts_mel, self.mel_overlap_dict[uuid], self.mel_window)
+        # Prepend the cached mel frames so the vocoder sees continuous input.
+        if self.hift_cache_dict[uuid] is not None:
+            hift_cache_mel, hift_cache_source = (
+                self.hift_cache_dict[uuid]["mel"],
+                self.hift_cache_dict[uuid]["source"],
+            )
+            tts_mel = paddle.cat([hift_cache_mel, tts_mel], dim=2)
+        else:
+            hift_cache_source = paddle.zeros([1, 1, 0])
+        if finalize is False:
+            # Intermediate chunk: hold back the mel tail for the next
+            # cross-fade and cache the vocoder state.
+            self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len :]
+            tts_mel = tts_mel[:, :, : -self.mel_overlap_len]
+            tts_speech, tts_source = self.hift.inference(
+                speech_feat=tts_mel, cache_source=hift_cache_source
+            )
+            if self.hift_cache_dict[uuid] is not None:
+                tts_speech = fade_in_out(
+                    tts_speech, self.hift_cache_dict[uuid]["speech"], self.speech_window
+                )
+            self.hift_cache_dict[uuid] = {
+                "mel": tts_mel[:, :, -self.mel_cache_len :],
+                "source": tts_source[:, :, -self.source_cache_len :],
+                "speech": tts_speech[:, -self.source_cache_len :],
+            }
+            # The cached speech tail is emitted with the next chunk instead.
+            tts_speech = tts_speech[:, : -self.source_cache_len]
+        else:
+            # Final chunk: nothing is held back or cached.
+            if speed != 1.0:
+                assert (
+                    self.hift_cache_dict[uuid] is None
+                ), "speed change only support non-stream inference mode"
+                # Resample the mel along its last axis to change the speed.
+                tts_mel = paddle.nn.functional.interpolate(
+                    x=tts_mel, size=int(tts_mel.shape[2] / speed), mode="linear"
+                )
+            tts_speech, tts_source = self.hift.inference(
+                speech_feat=tts_mel, cache_source=hift_cache_source
+            )
+            if self.hift_cache_dict[uuid] is not None:
+                tts_speech = fade_in_out(
+                    tts_speech, self.hift_cache_dict[uuid]["speech"], self.speech_window
+                )
+        return tts_speech
+
+    def tts(
+        self,
+        text=paddle.zeros([1, 0], dtype=paddle.int32),
+        flow_embedding=paddle.zeros([0, 192]),
+        llm_embedding=paddle.zeros([0, 192]),
+        prompt_text=paddle.zeros([1, 0], dtype=paddle.int32),
+        llm_prompt_speech_token=paddle.zeros([1, 0], dtype=paddle.int32),
+        flow_prompt_speech_token=paddle.zeros([1, 0], dtype=paddle.int32),
+        prompt_speech_feat=paddle.zeros([1, 0, 80]),
+        source_speech_token=paddle.zeros([1, 0], dtype=paddle.int32),
+        stream=False,
+        speed=1.0,
+        **kwargs
+    ):
+        """Synthesize speech for one request, yielding waveform chunks.
+
+        A worker thread produces speech tokens (``llm_job``, or ``vc_job``
+        when ``source_speech_token`` is non-empty) while this generator turns
+        them into audio with ``token2wav``.
+
+        Note that the tensor defaults above are created once, when the
+        function is defined, and are shared by all calls.
+
+        Args:
+            text: Text tokens (or a generator of them) passed to ``llm_job``.
+            flow_embedding: Embedding passed to ``token2wav``.
+            llm_embedding: Embedding passed to ``llm_job``.
+            prompt_text: Prompt text tokens passed to ``llm_job``.
+            llm_prompt_speech_token: Prompt speech tokens for the LLM.
+            flow_prompt_speech_token: Prompt speech tokens for the flow model.
+            prompt_speech_feat: Prompt features for the flow model.
+            source_speech_token: When non-empty (axis 1), selects the
+                voice-conversion path instead of the LLM.
+            stream: When exactly ``True``, audio is yielded chunk by chunk as
+                tokens become available; otherwise one chunk is yielded.
+            speed: Forwarded to ``token2wav`` in non-streaming mode only.
+            **kwargs: Ignored.
+
+        Yields:
+            ``{"tts_speech": waveform}`` dicts with the waveform moved to CPU.
+        """
+        this_uuid = str(uuid.uuid1())
+        # Register fresh per-request state.
+        with self.lock:
+            self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = (
+                [],
+                False,
+            )
+            self.hift_cache_dict[this_uuid] = None
+            self.mel_overlap_dict[this_uuid] = paddle.zeros([1, 80, 0])
+            self.flow_cache_dict[this_uuid] = paddle.zeros([1, 80, 0, 2])
+        if source_speech_token.shape[1] == 0:
+            p = threading.Thread(
+                target=self.llm_job,
+                args=(
+                    text,
+                    prompt_text,
+                    llm_prompt_speech_token,
+                    llm_embedding,
+                    this_uuid,
+                ),
+            )
+        else:
+            p = threading.Thread(
+                target=self.vc_job, args=(source_speech_token, this_uuid)
+            )
+        # The bare string below is a leftover note from an automatic code
+        # conversion tool; as an expression statement it has no effect.
+        """Not Support auto convert *.start, please judge whether it is Pytorch API and convert by yourself"""
+        p.start()
+        if stream is True:
+            token_hop_len = self.token_min_hop_len
+            while True:
+                # Poll the token buffer that the worker thread is filling.
+                time.sleep(0.1)
+                if (
+                    len(self.tts_speech_token_dict[this_uuid])
+                    >= token_hop_len + self.token_overlap_len
+                ):
+                    # Enough tokens for one hop plus the overlap region.
+                    this_tts_speech_token = paddle.tensor(
+                        self.tts_speech_token_dict[this_uuid][
+                            : token_hop_len + self.token_overlap_len
+                        ]
+                    ).unsqueeze(dim=0)
+                    this_tts_speech = self.token2wav(
+                        token=this_tts_speech_token,
+                        prompt_token=flow_prompt_speech_token,
+                        prompt_feat=prompt_speech_feat,
+                        embedding=flow_embedding,
+                        uuid=this_uuid,
+                        finalize=False,
+                    )
+                    yield {"tts_speech": this_tts_speech.cpu()}
+                    # Drop the consumed hop; the overlap tokens stay buffered.
+                    with self.lock:
+                        self.tts_speech_token_dict[
+                            this_uuid
+                        ] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
+                    token_hop_len = min(
+                        self.token_max_hop_len,
+                        int(token_hop_len * self.stream_scale_factor),
+                    )
+                # Stop once the producer is done and less than a full chunk
+                # remains; the remainder is handled below.
+                if (
+                    self.llm_end_dict[this_uuid] is True
+                    and len(self.tts_speech_token_dict[this_uuid])
+                    < token_hop_len + self.token_overlap_len
+                ):
+                    break
+            p.join()
+            # Final chunk with whatever tokens are left.
+            this_tts_speech_token = paddle.tensor(
+                self.tts_speech_token_dict[this_uuid]
+            ).unsqueeze(dim=0)
+            this_tts_speech = self.token2wav(
+                token=this_tts_speech_token,
+                prompt_token=flow_prompt_speech_token,
+                prompt_feat=prompt_speech_feat,
+                embedding=flow_embedding,
+                uuid=this_uuid,
+                finalize=True,
+            )
+            yield {"tts_speech": this_tts_speech.cpu()}
+        else:
+            # Non-streaming: wait for all tokens, then synthesize in one go.
+            p.join()
+            this_tts_speech_token = paddle.tensor(
+                self.tts_speech_token_dict[this_uuid]
+            ).unsqueeze(dim=0)
+            this_tts_speech = self.token2wav(
+                token=this_tts_speech_token,
+                prompt_token=flow_prompt_speech_token,
+                prompt_feat=prompt_speech_feat,
+                embedding=flow_embedding,
+                uuid=this_uuid,
+                finalize=True,
+                speed=speed,
+            )
+            yield {"tts_speech": this_tts_speech.cpu()}
+        # Release per-request state. NOTE(review): this only runs when the
+        # generator is consumed to the end - confirm callers always do so.
+        with self.lock:
+            self.tts_speech_token_dict.pop(this_uuid)
+            self.llm_end_dict.pop(this_uuid)
+            self.mel_overlap_dict.pop(this_uuid)
+            self.hift_cache_dict.pop(this_uuid)
+            self.flow_cache_dict.pop(this_uuid)
+        if paddle.device.cuda.device_count() >= 1:
+            paddle.device.cuda.empty_cache()
+            paddle.device.current_stream().synchronize()
+
+
+class CosyVoice2Model(CosyVoiceModel):
+    def __init__(
+        self,
+        llm: paddle.nn.Layer,
+        flow: paddle.nn.Layer,
+        hift: paddle.nn.Layer,
+        fp16: bool = False,
+    ):
+        """Store the three sub-models and set up streaming bookkeeping.
+
+        The parent initializer is not invoked; only the attributes assigned
+        below exist on the instance.
+
+        Args:
+            llm: Token language model.
+            flow: Flow model.
+            hift: Vocoder model.
+            fp16: When exactly ``True``, ``llm`` and ``flow`` are converted
+                with ``.half()``.
+        """
+        gpu_available = paddle.device.cuda.device_count() >= 1
+        self.device = device2str("cuda" if gpu_available else "cpu")
+        self.llm, self.flow, self.hift = llm, flow, hift
+        self.fp16 = fp16
+        if self.fp16 is True:
+            self.llm.half()
+            self.flow.half()
+
+        # Streaming chunk size and vocoder cache / cross-fade lengths.
+        self.token_hop_len = 25
+        self.mel_cache_len = 8
+        self.source_cache_len = int(self.mel_cache_len * 480)
+        self.speech_window = np.hamming(2 * self.source_cache_len)
+
+        # Run the LLM on its own stream when a GPU is present.
+        if gpu_available:
+            self.llm_context = paddle.device.stream_guard(
+                paddle.device.Stream(device=device2str(self.device))
+            )
+        else:
+            self.llm_context = nullcontext()
+
+        # Per-request state, keyed by request uuid.
+        self.lock = threading.Lock()
+        self.tts_speech_token_dict = {}
+        self.llm_end_dict = {}
+        self.hift_cache_dict = {}
+
+    def load_jit(self, flow_encoder_model):
+        flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
+        self.flow.encoder = flow_encoder
+
+    def load_vllm(self, model_dir):
+        export_cosyvoice2_vllm(self.llm, model_dir, self.device)
+        from vllm import EngineArgs, LLMEngine
+
+        engine_args = EngineArgs(
+            model=model_dir,
+            skip_tokenizer_init=True,
+            enable_prompt_embeds=True,
+            gpu_memory_utilization=0.2,
+        )
+        self.llm.vllm = LLMEngine.from_engine_args(engine_args)
+        self.llm.lock = threading.Lock()
+        del self.llm.llm.model.model.layers
+
+    def token2wav(
+        self,
+        token,
+        prompt_token,
+        prompt_feat,
+        embedding,
+        token_offset,
+        uuid,
+        stream=False,
+        finalize=False,
+        speed=1.0,
+    ):
+        with paddle.amp.auto_cast(enable=self.fp16):
+            tts_mel, _ = self.flow.inference(
+                token=token.to(self.device),
+                token_len=paddle.tensor([token.shape[1]], dtype=paddle.int32).to(
+                    self.device
+                ),
+                prompt_token=prompt_token.to(self.device),
+                prompt_token_len=paddle.tensor(
+                    [prompt_token.shape[1]], dtype=paddle.int32
+                ).to(self.device),
+                prompt_feat=prompt_feat.to(self.device),
+                prompt_feat_len=paddle.tensor(
+                    [prompt_feat.shape[1]], dtype=paddle.int32
+                ).to(self.device),
+                embedding=embedding.to(self.device),
+                streaming=stream,
+                finalize=finalize,
+            )
+        tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio :]
+        if self.hift_cache_dict[uuid] is not None:
+            hift_cache_mel, hift_cache_source = (
+                self.hift_cache_dict[uuid]["mel"],
+                self.hift_cache_dict[uuid]["source"],
+            )
+            tts_mel = paddle.cat([hift_cache_mel, tts_mel], dim=2)
+        else:
+            hift_cache_source = paddle.zeros([1, 1, 0])
+        if finalize is False:
+            tts_speech, tts_source = self.hift.inference(
+                speech_feat=tts_mel, cache_source=hift_cache_source
+            )
+            if self.hift_cache_dict[uuid] is not None:
+                tts_speech = fade_in_out(
+                    tts_speech, self.hift_cache_dict[uuid]["speech"], self.speech_window
+                )
+            self.hift_cache_dict[uuid] = {
+                "mel": tts_mel[:, :, -self.mel_cache_len :],
+                "source": tts_source[:, :, -self.source_cache_len :],
+                "speech": tts_speech[:, -self.source_cache_len :],
+            }
+            tts_speech = tts_speech[:, : -self.source_cache_len]
+        else:
+            if speed != 1.0:
+                assert (
+                    self.hift_cache_dict[uuid] is None
+                ), "speed change only support non-stream inference mode"
+                tts_mel = paddle.nn.functional.interpolate(
+                    x=tts_mel, size=int(tts_mel.shape[2] / speed), mode="linear"
+                )
+            tts_speech, tts_source = self.hift.inference(
+                speech_feat=tts_mel, cache_source=hift_cache_source
+            )
+            if self.hift_cache_dict[uuid] is not None:
+                tts_speech = fade_in_out(
+                    tts_speech, self.hift_cache_dict[uuid]["speech"], self.speech_window
+                )
+        return tts_speech
+
+    def tts(
+        self,
+        text=paddle.zeros([1, 0], dtype=paddle.int32),
+        flow_embedding=paddle.zeros([0, 192]),
+        llm_embedding=paddle.zeros([0, 192]),
+        prompt_text=paddle.zeros([1, 0], dtype=paddle.int32),
+        llm_prompt_speech_token=paddle.zeros([1, 0], dtype=paddle.int32),
+        flow_prompt_speech_token=paddle.zeros([1, 0], dtype=paddle.int32),
+        prompt_speech_feat=paddle.zeros([1, 0, 80]),
+        source_speech_token=paddle.zeros([1, 0], dtype=paddle.int32),
+        stream=False,
+        speed=1.0,
+        **kwargs
+    ):
+        this_uuid = str(uuid.uuid1())
+        with self.lock:
+            self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = (
+                [],
+                False,
+            )
+            self.hift_cache_dict[this_uuid] = None
+        if source_speech_token.shape[1] == 0:
+            p = threading.Thread(
+                target=self.llm_job,
+                args=(
+                    text,
+                    prompt_text,
+                    llm_prompt_speech_token,
+                    llm_embedding,
+                    this_uuid,
+                ),
+            )
+        else:
+            p = threading.Thread(
+                target=self.vc_job, args=(source_speech_token, this_uuid)
+            )
+        """Not Support auto convert *.start, please judge whether it is Pytorch API and convert by yourself"""
+        p.start()
+        if stream is True:
+            token_offset = 0
+            prompt_token_pad = int(
+                np.ceil(flow_prompt_speech_token.shape[1] / self.token_hop_len)
+                * self.token_hop_len
+                - flow_prompt_speech_token.shape[1]
+            )
+            while True:
+                time.sleep(0.1)
+                this_token_hop_len = (
+                    self.token_hop_len + prompt_token_pad
+                    if token_offset == 0
+                    else self.token_hop_len
+                )
+                if (
+                    len(self.tts_speech_token_dict[this_uuid]) - token_offset
+                    >= this_token_hop_len + self.flow.pre_lookahead_len
+                ):
+                    this_tts_speech_token = paddle.tensor(
+                        self.tts_speech_token_dict[this_uuid][
+                            : token_offset
+                            + this_token_hop_len
+                            + self.flow.pre_lookahead_len
+                        ]
+                    ).unsqueeze(dim=0)
+                    this_tts_speech = self.token2wav(
+                        token=this_tts_speech_token,
+                        prompt_token=flow_prompt_speech_token,
+                        prompt_feat=prompt_speech_feat,
+                        embedding=flow_embedding,
+                        token_offset=token_offset,
+                        uuid=this_uuid,
+                        stream=stream,
+                        finalize=False,
+                    )
+                    token_offset += this_token_hop_len
+                    yield {"tts_speech": this_tts_speech.cpu()}
+                if (
+                    self.llm_end_dict[this_uuid] is True
+                    and len(self.tts_speech_token_dict[this_uuid]) - token_offset
+                    < this_token_hop_len + self.flow.pre_lookahead_len
+                ):
+                    break
+            p.join()
+            this_tts_speech_token = paddle.tensor(
+                self.tts_speech_token_dict[this_uuid]
+            ).unsqueeze(dim=0)
+            this_tts_speech = self.token2wav(
+                token=this_tts_speech_token,
+                prompt_token=flow_prompt_speech_token,
+                prompt_feat=prompt_speech_feat,
+                embedding=flow_embedding,
+                token_offset=token_offset,
+                uuid=this_uuid,
+                finalize=True,
+            )
+            yield {"tts_speech": this_tts_speech.cpu()}
+        else:
+            p.join()
+            this_tts_speech_token = paddle.tensor(
+                self.tts_speech_token_dict[this_uuid]
+            ).unsqueeze(dim=0)
+            this_tts_speech = self.token2wav(
+                token=this_tts_speech_token,
+                prompt_token=flow_prompt_speech_token,
+                prompt_feat=prompt_speech_feat,
+                embedding=flow_embedding,
+                token_offset=0,
+                uuid=this_uuid,
+                finalize=True,
+                speed=speed,
+            )
+            yield {"tts_speech": this_tts_speech.cpu()}
+        with self.lock:
+            self.tts_speech_token_dict.pop(this_uuid)
+            self.llm_end_dict.pop(this_uuid)
+            self.hift_cache_dict.pop(this_uuid)
+        if paddle.device.cuda.device_count() >= 1:
+            paddle.device.cuda.empty_cache()
+            paddle.device.current_stream().synchronize()
\ No newline at end of file
diff --git a/paddlespeech/t2s/models/CosyVoice/test.py b/paddlespeech/t2s/models/CosyVoice/test.py
new file mode 100644
index 0000000000..d78273f7c8
--- /dev/null
+++ b/paddlespeech/t2s/models/CosyVoice/test.py
@@ -0,0 +1,2 @@
+import torchaudio
+import 
\ No newline at end of file
diff --git a/paddlespeech/t2s/modules/decoder.py b/paddlespeech/t2s/modules/decoder.py
new file mode 100644
index 0000000000..b7fa01fc26
--- /dev/null
+++ b/paddlespeech/t2s/modules/decoder.py
@@ -0,0 +1,9 @@
+class Transpose(torch.nn.Module):
+    """Module wrapper that exchanges two dimensions of its input tensor."""
+
+    def __init__(self, dim0: int, dim1: int):
+        super().__init__()
+        # The pair of axes swapped on every forward call.
+        self.dim0, self.dim1 = dim0, dim1
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return ``x`` with ``dim0`` and ``dim1`` swapped."""
+        return x.transpose(self.dim0, self.dim1)
\ No newline at end of file
diff --git a/paddlespeech/t2s/modules/flow/attention.py b/paddlespeech/t2s/modules/flow/attention.py
new file mode 100644
index 0000000000..b5a7069d38
--- /dev/null
+++ b/paddlespeech/t2s/modules/flow/attention.py
@@ -0,0 +1,227 @@
+
+class BasicTransformerBlock(nn.Module):
+    r"""
+    A basic Transformer block: pre-norm self-attention followed by a pre-norm
+    feed-forward network, each wrapped in a residual connection.
+
+    No cross-attention layer is built by this implementation (``norm2`` and
+    ``attn2`` are always ``None``).
+
+    NOTE(review): this class derives from ``nn.Module`` while ``FeedForward``
+    in the same file derives from ``nn.Layer`` -- confirm which framework the
+    module is meant to target.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        cross_attention_dim (`int`, *optional*): Accepted for signature compatibility; not used here.
+        num_embeds_ada_norm (`int`, *optional*): Only consulted to validate `norm_type` and set the `use_ada_*` flags.
+        attention_bias (`bool`, *optional*, defaults to `False`): Whether the attention projections have a bias.
+        double_self_attention (`bool`, *optional*): Accepted for signature compatibility; not used here.
+        upcast_attention (`bool`, *optional*): Forwarded to the self-attention layer.
+        norm_elementwise_affine (`bool`, *optional*): Forwarded to both `LayerNorm` layers.
+        norm_type (`str`, *optional*): `"ada_norm"`/`"ada_norm_zero"` require `num_embeds_ada_norm`.
+        final_dropout (`bool`, *optional*): Forwarded to the feed-forward network.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        activation_fn: str = "geglu",
+        cross_attention_dim: Optional[int] = None,
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",
+        final_dropout: bool = False,
+    ):
+        super().__init__()
+        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+
+        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+            )
+        # 1. Self-attention with its own normalization layer.
+        self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=None,
+            upcast_attention=upcast_attention,
+        )
+        # 2. Cross-attention is intentionally not built.
+        self.norm2 = None
+        self.attn2 = None
+        # 3. Feed-forward with its own normalization layer.
+        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    def forward(self, hidden_states, attention_mask=None):
+        """Apply self-attention and feed-forward, each with a residual add.
+
+        Args:
+            hidden_states: input tensor.
+            attention_mask: optional mask forwarded unchanged to the
+                self-attention layer.
+
+        Returns:
+            Tensor produced by the two residual sub-blocks.
+        """
+        norm_hidden_states = self.norm1(hidden_states)
+        # attn1 is a pure self-attention layer, so no encoder states are passed.
+        attn_output = self.attn1(
+            norm_hidden_states,
+            encoder_hidden_states=None,
+            attention_mask=attention_mask,
+        )
+        hidden_states = attn_output + hidden_states
+        norm_hidden_states = self.norm3(hidden_states)
+        ff_output = self.ff(norm_hidden_states)
+        hidden_states = ff_output + hidden_states
+        return hidden_states
+
+class FeedForward(nn.Layer):
+    """Feed-forward stack: activation projection, dropout, output projection.
+
+    The activation layer is selected by ``activation_fn``; names that are not
+    recognised fall back to ``GEGLU``. A second dropout is appended when
+    ``final_dropout`` is truthy.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        dim_out: Optional[int] = None,
+        mult: int = 4,
+        dropout: float = 0.0,
+        activation_fn: str = "geglu",
+        final_dropout: bool = False,
+    ):
+        super().__init__()
+        hidden_dim = int(dim * mult)
+        if dim_out is None:
+            dim_out = dim
+
+        # Lazily evaluated constructors so only the selected layer is built.
+        candidates = (
+            ("gelu", lambda: GELU(dim, hidden_dim, approximate=False)),
+            ("gelu-approximate", lambda: GELU(dim, hidden_dim, approximate=True)),
+            ("geglu", lambda: GEGLU(dim, hidden_dim)),
+            ("geglu-approximate", lambda: ApproximateGELU(dim, hidden_dim)),
+            ("snakebeta", lambda: SnakeBeta(dim, hidden_dim)),
+        )
+        make_activation = next(
+            (make for label, make in candidates if activation_fn == label),
+            lambda: GEGLU(dim, hidden_dim),
+        )
+
+        stages = [
+            make_activation(),
+            nn.Dropout(dropout),
+            LoRACompatibleLinear(hidden_dim, dim_out),
+        ]
+        if final_dropout:
+            stages.append(nn.Dropout(dropout))
+
+        self.net = nn.LayerList()
+        for stage in stages:
+            self.net.append(stage)
+
+    def forward(self, hidden_states):
+        """Run ``hidden_states`` through every stage of ``self.net`` in order."""
+        out = hidden_states
+        for stage in self.net:
+            out = stage(out)
+        return out
+
+class Attention(nn.Module):
+    """Multi-head scaled-dot-product attention with an output projection.
+
+    When ``cross_attention_dim`` is ``None`` keys and values are projected
+    from ``query_dim`` inputs (self-attention); otherwise from
+    ``cross_attention_dim`` inputs. Several constructor arguments
+    (``upcast_softmax``, ``cross_attention_norm``, ``qk_norm``,
+    ``norm_num_groups``, ``spatial_norm_dim``, ``only_cross_attention``,
+    ``eps``, ``processor``, ...) are accepted for signature compatibility and
+    are not consulted by this implementation.
+
+    NOTE(review): this class uses PyTorch spellings (``nn.Module``,
+    ``nn.ModuleList``, ``torch.Tensor``, ``nn.Linear(..., bias=...)``) while
+    ``FeedForward`` in the same file uses Paddle ones -- confirm which
+    framework the module is meant to target.
+    """
+
+    def __init__(
+        self,
+        query_dim: int,
+        cross_attention_dim: Optional[int] = None,
+        heads: int = 8,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        bias: bool = False,
+        upcast_attention: bool = False,
+        upcast_softmax: bool = False,
+        cross_attention_norm: Optional[str] = None,
+        cross_attention_norm_num_groups: int = 32,
+        qk_norm: Optional[str] = None,
+        norm_num_groups: Optional[int] = None,
+        spatial_norm_dim: Optional[int] = None,
+        out_bias: bool = True,
+        scale_qk: bool = True,
+        only_cross_attention: bool = False,
+        eps: float = 1e-5,
+        rescale_output_factor: float = 1.0,
+        processor: Optional["AttnProcessor"] = None,
+        out_dim: int = None,
+    ):
+        super().__init__()
+        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
+        self.query_dim = query_dim
+        self.use_bias = bias
+        self.is_cross_attention = cross_attention_dim is not None
+        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
+        self.upcast_attention = upcast_attention
+        self.upcast_softmax = upcast_softmax
+        self.rescale_output_factor = rescale_output_factor
+        self.dropout = dropout
+        self.fused_projections = False
+        self.out_dim = out_dim if out_dim is not None else query_dim
+
+        self.scale_qk = scale_qk
+        self.scale = dim_head**-0.5 if self.scale_qk else 1.0
+
+        self.heads = out_dim // dim_head if out_dim is not None else heads
+        self.sliceable_head_dim = heads
+        self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
+        self.to_k = nn.Linear(self.cross_attention_dim, self.inner_dim, bias=bias)
+        self.to_v = nn.Linear(self.cross_attention_dim, self.inner_dim, bias=bias)
+        # to_out[0] is the output projection, to_out[1] the dropout.
+        self.to_out = nn.ModuleList([])
+        self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
+        self.to_out.append(nn.Dropout(dropout))
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+    ):
+        """Attend from ``hidden_states`` to ``encoder_hidden_states``.
+
+        Args:
+            hidden_states: query input, either 3-D or 4-D; a 4-D input is
+                flattened over its last two axes and restored on return.
+            encoder_hidden_states: key/value input; defaults to
+                ``hidden_states`` (self-attention) when ``None``.
+            attention_mask: passed unchanged as ``attn_mask`` to
+                ``F.scaled_dot_product_attention``.
+            temb: accepted for signature compatibility; unused.
+
+        Returns:
+            The projected attention output divided by
+            ``rescale_output_factor``.
+        """
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        # Self-attention: keys and values come from the queries' own input.
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        batch_size = hidden_states.shape[0]
+
+        query = self.to_q(hidden_states)
+        key = self.to_k(encoder_hidden_states)
+        value = self.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // self.heads
+
+        # Split the channel axis into heads and move the head axis forward.
+        query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        # Merge the heads back into a single channel axis.
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = self.to_out[0](hidden_states)
+        # dropout
+        hidden_states = self.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        hidden_states = hidden_states / self.rescale_output_factor
+        return hidden_states
+
diff --git a/paddlespeech/t2s/modules/flow/decoder.py b/paddlespeech/t2s/modules/flow/decoder.py
new file mode 100644
index 0000000000..4c5208b50d
--- /dev/null
+++ b/paddlespeech/t2s/modules/flow/decoder.py
@@ -0,0 +1,766 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional, Tuple
+
+import paddle
+import paddle.nn.functional as F
+from einops import pack, rearrange, repeat
+from paddle import nn
+
+from cosyvoice.utils.common import mask_to_bias
+from cosyvoice.utils.mask import add_optional_chunk_mask
+from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
+from .attention import BasicTransformerBlock
+
+def get_activation(act_fn):
+    """Return a new activation layer for the given name.
+
+    Supported names are ``"silu"``, ``"mish"``, ``"relu"`` and ``"gelu"``.
+
+    Raises:
+        ValueError: if ``act_fn`` is not one of the supported names.
+    """
+    # (name, attribute of ``nn``) pairs, checked in order.
+    known = (
+        ("silu", "Silu"),
+        ("mish", "Mish"),
+        ("relu", "ReLU"),
+        ("gelu", "GELU"),
+    )
+    for label, layer_name in known:
+        if act_fn == label:
+            return getattr(nn, layer_name)()
+    raise ValueError(f"Unsupported activation function: {act_fn}")
+
+class Block1D(nn.Layer):
+    """Conv1D -> GroupNorm -> Mish block applied to a masked input."""
+
+    def __init__(self, dim, dim_out, groups=8):
+        super().__init__()
+        conv = nn.Conv1D(dim, dim_out, 3, padding=1)
+        norm = nn.GroupNorm(groups, dim_out)
+        self.block = nn.Sequential(conv, norm, nn.Mish())
+
+    def forward(self, x, mask):
+        """Mask ``x``, run the block, and mask the result again."""
+        masked_input = x * mask
+        return self.block(masked_input) * mask
+
+class ResnetBlock1D(nn.Layer):
+    """Residual block of two ``Block1D`` layers conditioned on a time embedding."""
+
+    def __init__(self, dim, dim_out, time_emb_dim, groups=8):
+        super().__init__()
+        # Projects the time embedding to one value per output channel.
+        self.mlp = nn.Sequential(nn.Mish(), nn.Linear(time_emb_dim, dim_out))
+
+        self.block1 = Block1D(dim, dim_out, groups=groups)
+        self.block2 = Block1D(dim_out, dim_out, groups=groups)
+        # 1x1 convolution so the skip path matches the output channel count.
+        self.res_conv = nn.Conv1D(dim, dim_out, 1)
+
+    def forward(self, x, mask, time_emb):
+        """Return ``block2(block1(x) + mlp(time_emb)) + res_conv(x * mask)``."""
+        hidden = self.block1(x, mask)
+        # Add the time embedding, with a trailing axis appended for broadcasting.
+        hidden += self.mlp(time_emb).unsqueeze(-1)
+        hidden = self.block2(hidden, mask)
+        return hidden + self.res_conv(x * mask)
+
+class Downsample1D(nn.Layer):
+    """Stride-2 ``Conv1D`` (kernel 3, padding 1) that keeps the channel count."""
+
+    def __init__(self, dim):
+        super().__init__()
+        self.conv = nn.Conv1D(dim, dim, 3, stride=2, padding=1)
+
+    def forward(self, x):
+        """Apply the strided convolution to ``x``."""
+        downsampled = self.conv(x)
+        return downsampled
+
+class TimestepEmbedding(nn.Layer):
+    """Two-layer MLP that maps a timestep feature vector to an embedding.
+
+    Args:
+        in_channels: size of the input feature vector.
+        time_embed_dim: hidden size, and output size when ``out_dim`` is None.
+        act_fn: name of the activation between the two linear layers,
+            resolved through ``get_activation``.
+        out_dim: optional output size overriding ``time_embed_dim``.
+        post_act_fn: optional activation name applied after the second
+            linear layer.
+        cond_proj_dim: when given, a bias-free linear layer projects a
+            condition of this size to ``in_channels`` so it can be added to
+            the input.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        time_embed_dim: int,
+        act_fn: str = "silu",
+        out_dim: int = None,
+        post_act_fn: Optional[str] = None,
+        cond_proj_dim=None,
+    ):
+        super().__init__()
+
+        self.linear_1 = nn.Linear(in_channels, time_embed_dim)
+
+        if cond_proj_dim is not None:
+            # Paddle's Linear disables the bias through ``bias_attr=False``;
+            # it has no ``bias`` keyword.
+            self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias_attr=False)
+        else:
+            self.cond_proj = None
+
+        self.act = get_activation(act_fn)
+
+        if out_dim is not None:
+            time_embed_dim_out = out_dim
+        else:
+            time_embed_dim_out = time_embed_dim
+        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)
+
+        if post_act_fn is None:
+            self.post_act = None
+        else:
+            self.post_act = get_activation(post_act_fn)
+
+    def forward(self, sample, condition=None):
+        """Embed ``sample``, optionally adding a projected ``condition`` first.
+
+        ``condition`` is only used when it is given and the layer was built
+        with ``cond_proj_dim``.
+        """
+        if condition is not None and self.cond_proj is not None:
+            sample = sample + self.cond_proj(condition)
+        sample = self.linear_1(sample)
+
+        if self.act is not None:
+            sample = self.act(sample)
+
+        sample = self.linear_2(sample)
+
+        if self.post_act is not None:
+            sample = self.post_act(sample)
+        return sample
+
+class Upsample1D(nn.Layer):
+    """A 1D upsampling layer with an optional convolution.
+
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            apply a convolution after nearest-neighbour interpolation.
+        use_conv_transpose (`bool`, default `True`):
+            upsample with a transposed convolution; takes precedence over
+            `use_conv`.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+        name (`str`, default `"conv"`):
+            stored on the instance as `self.name`.
+    """
+
+    def __init__(self, channels, use_conv=False, use_conv_transpose=True, out_channels=None, name="conv"):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+
+        # Exactly one of: transposed conv, plain conv, or no conv at all.
+        if use_conv_transpose:
+            self.conv = nn.Conv1DTranspose(channels, self.out_channels, 4, stride=2, padding=1)
+        elif use_conv:
+            self.conv = nn.Conv1D(self.channels, self.out_channels, 3, padding=1)
+        else:
+            self.conv = None
+
+    def forward(self, inputs):
+        """Upsample ``inputs``, whose axis 1 must have ``self.channels`` entries."""
+        assert inputs.shape[1] == self.channels
+        if self.use_conv_transpose:
+            return self.conv(inputs)
+
+        upsampled = F.interpolate(inputs, scale_factor=2.0, mode="nearest")
+        return self.conv(upsampled) if self.use_conv else upsampled
+
+class Transpose(nn.Layer):
+    """Layer that swaps two axes of its input tensor.
+
+    Args:
+        dim0: first axis to swap (negative values count from the end).
+        dim1: second axis to swap (negative values count from the end).
+    """
+
+    def __init__(self, dim0: int, dim1: int):
+        super().__init__()
+        self.dim0 = dim0
+        self.dim1 = dim1
+
+    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+        """Return ``x`` with ``dim0`` and ``dim1`` exchanged."""
+        # paddle.transpose expects a full permutation of all axes, so start
+        # from the identity permutation and exchange the two requested axes.
+        perm = list(range(x.ndim))
+        perm[self.dim0], perm[self.dim1] = perm[self.dim1], perm[self.dim0]
+        return paddle.transpose(x, perm)
+
+class CausalConv1d(nn.Conv1D):
+    """1-D convolution that left-pads its input by ``kernel_size - 1``.
+
+    The underlying convolution is built with ``padding=0``; all padding is
+    added on the left of the last axis in ``forward``. Only ``stride == 1``
+    is accepted.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        padding_mode: str = 'zeros'
+    ) -> None:
+        assert stride == 1
+        super().__init__(in_channels, out_channels,
+                         kernel_size, stride,
+                         padding=0, dilation=dilation,
+                         groups=groups,
+                         padding_mode=padding_mode)
+        # Number of zeros prepended along the last axis before convolving.
+        self.causal_padding = kernel_size - 1
+
+    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+        """Left-pad ``x`` with zeros along its last axis, then convolve."""
+        # data_format is spelled out so the two-element pad is applied to
+        # the length axis of a (N, C, L) input.
+        x = F.pad(x, (self.causal_padding, 0), value=0.0, data_format="NCL")
+        x = super().forward(x)
+        return x
+
+class SinusoidalPosEmb(paddle.nn.Layer):
+    """Sinusoidal embedding: ``dim // 2`` sine features followed by cosines."""
+
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+        assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"
+
+    def forward(self, x, scale=1000):
+        """Embed ``x`` (a 0-D input is first given a leading axis).
+
+        Returns the concatenation, along the last axis, of the sine and the
+        cosine of ``scale * x`` multiplied by geometrically spaced frequencies.
+        """
+        if x.ndim < 1:
+            x = x.unsqueeze(0)
+        n_freqs = self.dim // 2
+        log_step = math.log(10000) / (n_freqs - 1)
+        freqs = paddle.exp(paddle.arange(n_freqs).astype('float32') * -log_step)
+        angles = scale * x.unsqueeze(1) * freqs.unsqueeze(0)
+        return paddle.concat([paddle.sin(angles), paddle.cos(angles)], axis=-1)
+
+class CausalBlock1D(Block1D):
+    """``Block1D`` variant built from a causal convolution and ``LayerNorm``.
+
+    The parent constructor runs first; its ``block`` is then replaced by
+    CausalConv1d -> Transpose -> LayerNorm -> Transpose -> Mish. The two
+    ``Transpose(1, 2)`` layers move the channel axis last for ``LayerNorm``
+    and back again afterwards.
+    """
+
+    def __init__(self, dim: int, dim_out: int):
+        super(CausalBlock1D, self).__init__(dim, dim_out)
+        self.block = nn.Sequential(
+            CausalConv1d(dim, dim_out, 3),
+            Transpose(1, 2),
+            nn.LayerNorm(dim_out),
+            Transpose(1, 2),
+            nn.Mish()
+        )
+
+    def forward(self, x: paddle.Tensor, mask: paddle.Tensor) -> paddle.Tensor:
+        """Mask ``x``, run the causal block, and mask the result again.
+
+        Returns:
+            A single tensor (not a tuple).
+        """
+        output = self.block(x * mask)
+        return output * mask
+
+
+class CausalResnetBlock1D(ResnetBlock1D):
+    """``ResnetBlock1D`` whose two inner blocks are ``CausalBlock1D`` layers.
+
+    ``groups`` is forwarded to the parent constructor only; the causal
+    replacement blocks are built without it.
+    """
+
+    def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
+        super().__init__(dim, dim_out, time_emb_dim, groups)
+        # Replace the parent's blocks with their causal counterparts.
+        first_block = CausalBlock1D(dim, dim_out)
+        second_block = CausalBlock1D(dim_out, dim_out)
+        self.block1 = first_block
+        self.block2 = second_block
+
+def subsequent_chunk_mask(
+        size: int,
+        chunk_size: int,
+        num_left_chunks: int = -1,
+) -> paddle.Tensor:
+    """Create mask for subsequent steps (size, size) with chunk size,
+       this is for streaming encoder
+
+    Position ``i`` may attend to every position before the end of the chunk
+    that contains ``i``.
+
+    Args:
+        size (int): size of mask
+        chunk_size (int): size of chunk
+        num_left_chunks (int): accepted for interface compatibility only;
+            this implementation ignores it and always exposes all left
+            chunks.
+
+    Returns:
+        paddle.Tensor: boolean mask of shape (size, size)
+
+    Examples:
+        >>> subsequent_chunk_mask(4, 2)
+        [[1, 1, 0, 0],
+         [1, 1, 0, 0],
+         [1, 1, 1, 1],
+         [1, 1, 1, 1]]
+    """
+    pos_idx = paddle.arange(size, dtype='int64')
+    # Exclusive end index of the chunk each position belongs to. The ``//``
+    # operator is used because it accepts a plain Python int on the right.
+    block_value = (pos_idx // chunk_size + 1) * chunk_size
+    ret = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
+
+    return ret
+
+
+def add_optional_chunk_mask(xs: paddle.Tensor,
+                            masks: paddle.Tensor,
+                            use_dynamic_chunk: bool,
+                            use_dynamic_left_chunk: bool,
+                            decoding_chunk_size: int,
+                            static_chunk_size: int,
+                            num_decoding_left_chunks: int,
+                            enable_full_context: bool = True):
+    """ Apply optional mask for encoder.
+
+    Args:
+        xs (paddle.Tensor): padded input, (B, L, D), L for max length
+        mask (paddle.Tensor): mask for xs, (B, 1, L)
+        use_dynamic_chunk (bool): whether to use dynamic chunk or not
+        use_dynamic_left_chunk (bool): whether to use dynamic left chunk for
+            training.
+        decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's
+            0: default for training, use random dynamic chunk.
+            <0: for decoding, use full chunk.
+            >0: for decoding, use fixed chunk size as set.
+        static_chunk_size (int): chunk size for static chunk training/decoding
+            if it's greater than 0, if use_dynamic_chunk is true,
+            this parameter will be ignored
+        num_decoding_left_chunks: number of left chunks, this is for decoding,
+            the chunk size is decoding_chunk_size.
+            >=0: use num_decoding_left_chunks
+            <0: use all left chunks
+        enable_full_context (bool):
+            True: chunk size is either [1, 25] or full context(max_len)
+            False: chunk size ~ U[1, 25]
+
+    Returns:
+        paddle.Tensor: chunk mask of the input xs.
+    """
+    # Whether to use chunk mask or not
+    if use_dynamic_chunk:
+        max_len = xs.shape[1]
+        if decoding_chunk_size < 0:
+            chunk_size = max_len
+            num_left_chunks = -1
+        elif decoding_chunk_size > 0:
+            chunk_size = decoding_chunk_size
+            num_left_chunks = num_decoding_left_chunks
+        else:
+            # chunk size is either [1, 25] or full context(max_len).
+            # Since we use 4 times subsampling and allow up to 1s(100 frames)
+            # delay, the maximum frame is 100 / 4 = 25.
+            chunk_size = paddle.randint(1, max_len, shape=(1,)).item()
+            num_left_chunks = -1
+            if chunk_size > max_len // 2 and enable_full_context:
+                chunk_size = max_len
+            else:
+                chunk_size = chunk_size % 25 + 1
+                if use_dynamic_left_chunk:
+                    max_left_chunks = (max_len - 1) // chunk_size
+                    # randint needs low < high; with no left chunk available
+                    # the only valid choice is 0.
+                    if max_left_chunks > 0:
+                        num_left_chunks = paddle.randint(0, max_left_chunks, shape=(1,)).item()
+                    else:
+                        num_left_chunks = 0
+
+        chunk_masks = subsequent_chunk_mask(xs.shape[1], chunk_size,
+                                            num_left_chunks)  # (L, L)
+        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
+        chunk_masks = masks & chunk_masks  # (B, L, L)
+    elif static_chunk_size > 0:
+        num_left_chunks = num_decoding_left_chunks
+        chunk_masks = subsequent_chunk_mask(xs.shape[1], static_chunk_size,
+                                            num_left_chunks)  # (L, L)
+        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
+        chunk_masks = masks & chunk_masks  # (B, L, L)
+    else:
+        chunk_masks = masks
+
+    # Compare against the dtype object rather than its string name.
+    assert chunk_masks.dtype == paddle.bool
+    # Rows with no True entry at all are forced to all-True below.
+    all_false_mask = chunk_masks.sum(axis=-1) == 0
+    if all_false_mask.sum().item() != 0:
+        print('get chunk_masks all false at some timestep, force set to true, make sure they are masked in future computation!')
+        chunk_masks = paddle.where(all_false_mask.unsqueeze(-1), paddle.ones_like(chunk_masks, dtype='bool'), chunk_masks)
+
+    return chunk_masks
+
+def mask_to_bias(mask: paddle.Tensor, dtype: str) -> paddle.Tensor:
+    """Convert a boolean mask into an additive bias: 0.0 where True, -1.0e+10 where False."""
+    assert mask.dtype in ('bool', paddle.bool), "Input mask must be of boolean type"
+    # Callers in this file pass `x.dtype` (a paddle dtype object), so accept both spellings.
+    supported = ['float32', 'bfloat16', 'float16', paddle.float32, paddle.bfloat16, paddle.float16]
+    assert dtype in supported, f"Unsupported dtype: {dtype}"
+    return (1.0 - mask.astype(dtype)) * -1.0e+10
+
+class ConditionalDecoder(nn.Layer):
+    """1-D conditional U-Net built from down, mid and up stacks of ResNet + transformer blocks."""
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        channels=(256, 256),
+        dropout=0.05,
+        attention_head_dim=64,
+        n_blocks=1,
+        num_mid_blocks=2,
+        num_heads=4,
+        act_fn="snake",
+    ):
+        """
+        This decoder requires an input with the same shape of the target. So, if your text content
+        is shorter or longer than the outputs, please re-sampling it before feeding to the decoder.
+        """
+        super().__init__()
+        channels = tuple(channels)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.time_embeddings = SinusoidalPosEmb(in_channels)
+        time_embed_dim = channels[0] * 4
+        self.time_mlp = TimestepEmbedding(
+            in_channels=in_channels,
+            time_embed_dim=time_embed_dim,
+            act_fn="silu",
+        )
+        self.down_blocks = nn.LayerList([])
+        self.mid_blocks = nn.LayerList([])
+        self.up_blocks = nn.LayerList([])
+
+        output_channel = in_channels
+        for i in range(len(channels)):
+            input_channel = output_channel
+            output_channel = channels[i]
+            is_last = i == len(channels) - 1
+            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+            transformer_blocks = nn.LayerList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            downsample = (
+                Downsample1D(output_channel) if not is_last else nn.Conv1D(output_channel, output_channel, 3, padding=1)
+            )
+            self.down_blocks.append(nn.LayerList([resnet, transformer_blocks, downsample]))
+
+        for _ in range(num_mid_blocks):
+            input_channel = channels[-1]
+            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)  # output_channel is still channels[-1] here
+
+            transformer_blocks = nn.LayerList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+
+            self.mid_blocks.append(nn.LayerList([resnet, transformer_blocks]))
+
+        channels = channels[::-1] + (channels[0],)
+        for i in range(len(channels) - 1):
+            input_channel = channels[i] * 2  # doubled: the skip connection is packed onto x in forward
+            output_channel = channels[i + 1]
+            is_last = i == len(channels) - 2
+            resnet = ResnetBlock1D(
+                dim=input_channel,
+                dim_out=output_channel,
+                time_emb_dim=time_embed_dim,
+            )
+            transformer_blocks = nn.LayerList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            upsample = (
+                Upsample1D(output_channel, use_conv_transpose=True)
+                if not is_last
+                else nn.Conv1D(output_channel, output_channel, 3, padding=1)
+            )
+            self.up_blocks.append(nn.LayerList([resnet, transformer_blocks, upsample]))
+        self.final_block = Block1D(channels[-1], channels[-1])
+        self.final_proj = nn.Conv1D(channels[-1], self.out_channels, 1)
+        self.initialize_weights()
+
+    def initialize_weights(self):
+        """Kaiming-normal init for Conv1D/Linear weights, zero biases, unit GroupNorm scale."""
+        # Initializers must be instantiated and then called on the parameter to take effect.
+        kaiming = nn.initializer.KaimingNormal(nonlinearity='relu')
+        zeros, ones = nn.initializer.Constant(value=0.0), nn.initializer.Constant(value=1.0)
+        for m in self.sublayers():
+            if isinstance(m, (nn.Conv1D, nn.Linear)):
+                kaiming(m.weight)
+                if m.bias is not None:
+                    zeros(m.bias)
+            elif isinstance(m, nn.GroupNorm):
+                ones(m.weight)
+                zeros(m.bias)
+
+    def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False):
+        """Forward pass of the UNet1DConditional model.
+
+        Args:
+            x (paddle.Tensor): shape (batch_size, in_channels, time)
+            mask (paddle.Tensor): shape (batch_size, 1, time)
+            mu (paddle.Tensor): conditioning packed with x along the channel axis
+            t (paddle.Tensor): shape (batch_size)
+            spks (paddle.Tensor, optional): shape: (batch_size, condition_channels). Defaults to None.
+            cond (paddle.Tensor, optional): extra conditioning packed along the channel axis. Defaults to None.
+            streaming (bool, optional): unused here; kept for signature parity with CausalConditionalDecoder.
+
+        Returns:
+            paddle.Tensor: output tensor
+        """
+        t = self.time_embeddings(t).astype(t.dtype)
+        t = self.time_mlp(t)
+        x = pack([x, mu], "b * t")[0]
+
+        if spks is not None:
+            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
+            x = pack([x, spks], "b * t")[0]
+        if cond is not None:
+            x = pack([x, cond], "b * t")[0]
+
+        hiddens = []
+        masks = [mask]
+        for resnet, transformer_blocks, downsample in self.down_blocks:
+            mask_down = masks[-1]
+            x = resnet(x, mask_down, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            attn_mask = add_optional_chunk_mask(x, mask_down.astype('bool'), False, False, 0, 0, -1).tile([1, x.shape[1], 1])
+            attn_mask = mask_to_bias(attn_mask, x.dtype)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+            hiddens.append(x)  # Save hidden states for skip connections
+            x = downsample(x * mask_down)
+            masks.append(mask_down[:, :, ::2])
+        masks = masks[:-1]
+        mask_mid = masks[-1]
+
+        for resnet, transformer_blocks in self.mid_blocks:
+            x = resnet(x, mask_mid, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            attn_mask = add_optional_chunk_mask(x, mask_mid.astype('bool'), False, False, 0, 0, -1).tile([1, x.shape[1], 1])
+            attn_mask = mask_to_bias(attn_mask, x.dtype)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+
+        for resnet, transformer_blocks, upsample in self.up_blocks:
+            mask_up = masks.pop()
+            skip = hiddens.pop()
+            x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
+            x = resnet(x, mask_up, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            attn_mask = add_optional_chunk_mask(x, mask_up.astype('bool'), False, False, 0, 0, -1).tile([1, x.shape[1], 1])
+            attn_mask = mask_to_bias(attn_mask, x.dtype)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+            x = upsample(x * mask_up)
+        x = self.final_block(x, mask_up)
+        output = self.final_proj(x * mask_up)
+        return output * mask
+
+
+class CausalConditionalDecoder(nn.Layer):
+    """Causal counterpart of ConditionalDecoder (causal ResNet/conv blocks) with chunked attention when streaming."""
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        channels=(256, 256),
+        dropout=0.05,
+        attention_head_dim=64,
+        n_blocks=1,
+        num_mid_blocks=2,
+        num_heads=4,
+        act_fn="snake",
+        static_chunk_size=50,
+        num_decoding_left_chunks=2,
+    ):
+        """
+        This decoder requires an input with the same shape of the target. So, if your text content
+        is shorter or longer than the outputs, please re-sampling it before feeding to the decoder.
+        """
+        super().__init__()
+        channels = tuple(channels)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.time_embeddings = SinusoidalPosEmb(in_channels)
+        time_embed_dim = channels[0] * 4
+        self.time_mlp = TimestepEmbedding(
+            in_channels=in_channels,
+            time_embed_dim=time_embed_dim,
+            act_fn="silu",
+        )
+        self.static_chunk_size = static_chunk_size
+        self.num_decoding_left_chunks = num_decoding_left_chunks  # stored; forward always passes -1 instead
+        self.down_blocks = nn.LayerList([])
+        self.mid_blocks = nn.LayerList([])
+        self.up_blocks = nn.LayerList([])
+
+        output_channel = in_channels
+        for i in range(len(channels)):
+            input_channel = output_channel
+            output_channel = channels[i]
+            is_last = i == len(channels) - 1
+            resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+            transformer_blocks = nn.LayerList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            downsample = (
+                Downsample1D(output_channel) if not is_last else CausalConv1d(output_channel, output_channel, 3)  # assumes CausalConv1d is available in this module
+            )
+            self.down_blocks.append(nn.LayerList([resnet, transformer_blocks, downsample]))
+
+        for _ in range(num_mid_blocks):
+            input_channel = channels[-1]
+            resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)  # output_channel is still channels[-1] here
+
+            transformer_blocks = nn.LayerList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+
+            self.mid_blocks.append(nn.LayerList([resnet, transformer_blocks]))
+
+        channels = channels[::-1] + (channels[0],)
+        for i in range(len(channels) - 1):
+            input_channel = channels[i] * 2  # doubled: the skip connection is packed onto x in forward
+            output_channel = channels[i + 1]
+            is_last = i == len(channels) - 2
+            resnet = CausalResnetBlock1D(
+                dim=input_channel,
+                dim_out=output_channel,
+                time_emb_dim=time_embed_dim,
+            )
+            transformer_blocks = nn.LayerList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            upsample = (
+                Upsample1D(output_channel, use_conv_transpose=True)  # assumes Upsample1D is available in this module
+                if not is_last
+                else CausalConv1d(output_channel, output_channel, 3)
+            )
+            self.up_blocks.append(nn.LayerList([resnet, transformer_blocks, upsample]))
+        self.final_block = CausalBlock1D(channels[-1], channels[-1])  # assumes CausalBlock1D is available in this module
+        self.final_proj = nn.Conv1D(channels[-1], self.out_channels, 1)  # kernel size 1 projection to out_channels
+        self.initialize_weights()
+
+    def initialize_weights(self):
+        """Kaiming-normal init for Conv1D/Linear weights, zero biases, unit GroupNorm scale."""
+        # Initializers must be instantiated and then called on the parameter to take effect.
+        kaiming = nn.initializer.KaimingNormal(nonlinearity='relu')
+        zeros, ones = nn.initializer.Constant(value=0.0), nn.initializer.Constant(value=1.0)
+        for m in self.sublayers():  # paddle exposes sublayers() where torch has modules()
+            if isinstance(m, (nn.Conv1D, nn.Linear)):
+                kaiming(m.weight)
+                if m.bias is not None:
+                    zeros(m.bias)
+            elif isinstance(m, nn.GroupNorm):
+                ones(m.weight)
+                zeros(m.bias)
+
+    def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False):
+        """Forward pass of the UNet1DConditional model.
+
+        Args:
+            x (paddle.Tensor): shape (batch_size, in_channels, time)
+            mask (paddle.Tensor): shape (batch_size, 1, time)
+            mu (paddle.Tensor): mean tensor for conditioning
+            t (paddle.Tensor): shape (batch_size)
+            spks (paddle.Tensor, optional): shape: (batch_size, condition_channels). Defaults to None.
+            cond (paddle.Tensor, optional): extra conditioning packed along the channel axis. Defaults to None.
+            streaming (bool, optional): whether to use streaming mode. Defaults to False.
+
+        Returns:
+            paddle.Tensor: output tensor
+        """
+        t = self.time_embeddings(t).astype(t.dtype)  # cast the embedding back to the dtype of the input t
+        t = self.time_mlp(t)
+
+        x = pack([x, mu], "b * t")[0]  # concatenate along the channel axis
+
+        if spks is not None:
+            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])  # broadcast the speaker vector over time
+            x = pack([x, spks], "b * t")[0]
+        if cond is not None:
+            x = pack([x, cond], "b * t")[0]
+
+        hiddens = []
+        masks = [mask]
+        for resnet, transformer_blocks, downsample in self.down_blocks:
+            mask_down = masks[-1]
+            x = resnet(x, mask_down, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()  # (batch, time, channels) layout for the transformer blocks
+            if streaming is True:
+                attn_mask = add_optional_chunk_mask(x, mask_down.astype('bool'), False, False, 0, self.static_chunk_size, -1)  # static chunk mask
+            else:
+                attn_mask = add_optional_chunk_mask(x, mask_down.astype('bool'), False, False, 0, 0, -1).tile([1, x.shape[1], 1])  # padding mask tiled along axis 1
+            attn_mask = mask_to_bias(attn_mask, x.dtype)  # boolean mask -> additive attention bias
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+            hiddens.append(x)  # Save hidden states for skip connections
+            x = downsample(x * mask_down)
+            masks.append(mask_down[:, :, ::2])
+        masks = masks[:-1]
+        mask_mid = masks[-1]
+
+        for resnet, transformer_blocks in self.mid_blocks:
+            x = resnet(x, mask_mid, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            if streaming is True:
+                attn_mask = add_optional_chunk_mask(x, mask_mid.astype('bool'), False, False, 0, self.static_chunk_size, -1)
+            else:
+                attn_mask = add_optional_chunk_mask(x, mask_mid.astype('bool'), False, False, 0, 0, -1).tile([1, x.shape[1], 1])
+            attn_mask = mask_to_bias(attn_mask, x.dtype)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+
+        for resnet, transformer_blocks, upsample in self.up_blocks:
+            mask_up = masks.pop()
+            skip = hiddens.pop()
+            x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
+            x = resnet(x, mask_up, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            if streaming is True:
+                attn_mask = add_optional_chunk_mask(x, mask_up.astype('bool'), False, False, 0, self.static_chunk_size, -1)
+            else:
+                attn_mask = add_optional_chunk_mask(x, mask_up.astype('bool'), False, False, 0, 0, -1).tile([1, x.shape[1], 1])
+            attn_mask = mask_to_bias(attn_mask, x.dtype)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+            x = upsample(x * mask_up)
+        x = self.final_block(x, mask_up)
+        output = self.final_proj(x * mask_up)
+        return output * mask
diff --git a/paddlespeech/t2s/modules/flow/flow.py b/paddlespeech/t2s/modules/flow/flow.py
new file mode 100644
index 0000000000..c22b7a2622
--- /dev/null
+++ b/paddlespeech/t2s/modules/flow/flow.py
@@ -0,0 +1,320 @@
+import logging
+import random
+from typing import Dict, Optional
+
+import paddle
+from omegaconf import DictConfig
+
+from cosyvoice.utils.mask import make_pad_mask
+
+
+class MaskedDiffWithXvec(paddle.nn.Layer):
+    """Token-to-feature flow model: embeds speech tokens, encodes and length-regulates them, then drives `decoder`."""
+    def __init__(
+        self,
+        input_size: int = 512,
+        output_size: int = 80,
+        spk_embed_dim: int = 192,
+        output_type: str = "mel",
+        vocab_size: int = 4096,
+        input_frame_rate: int = 50,
+        only_mask_loss: bool = True,
+        encoder: paddle.nn.Layer = None,
+        length_regulator: paddle.nn.Layer = None,
+        decoder: paddle.nn.Layer = None,
+        decoder_conf: Dict = {
+            "in_channels": 240,
+            "out_channel": 80,
+            "spk_emb_dim": 80,
+            "n_spks": 1,
+            "cfm_params": DictConfig(
+                {
+                    "sigma_min": 1e-06,
+                    "solver": "euler",
+                    "t_scheduler": "cosine",
+                    "training_cfg_rate": 0.2,
+                    "inference_cfg_rate": 0.7,
+                    "reg_loss_type": "l1",
+                }
+            ),
+            "decoder_params": {
+                "channels": [256, 256],
+                "dropout": 0.0,
+                "attention_head_dim": 64,
+                "n_blocks": 4,
+                "num_mid_blocks": 12,
+                "num_heads": 8,
+                "act_fn": "gelu",
+            },
+        },
+        mel_feat_conf: Dict = {
+            "n_fft": 1024,
+            "num_mels": 80,
+            "sampling_rate": 22050,
+            "hop_size": 256,
+            "win_size": 1024,
+            "fmin": 0,
+            "fmax": 8000,
+        },
+    ):
+        super().__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+        self.decoder_conf = decoder_conf
+        self.mel_feat_conf = mel_feat_conf
+        self.vocab_size = vocab_size
+        self.output_type = output_type
+        self.input_frame_rate = input_frame_rate
+        logging.info(f"input frame rate={self.input_frame_rate}")
+        self.input_embedding = paddle.nn.Embedding(vocab_size, input_size)
+        self.spk_embed_affine_layer = paddle.nn.Linear(
+            in_features=spk_embed_dim, out_features=output_size
+        )
+        self.encoder = encoder
+        self.encoder_proj = paddle.nn.Linear(
+            in_features=self.encoder.output_size(), out_features=output_size
+        )
+        self.decoder = decoder
+        self.length_regulator = length_regulator
+        self.only_mask_loss = only_mask_loss
+
+    def forward(self, batch: dict, device) -> Dict[str, Optional[paddle.Tensor]]:
+        """Return {"loss": ...} from `decoder.compute_loss` for one training batch moved to `device`."""
+        token = batch["speech_token"].to(device)
+        token_len = batch["speech_token_len"].to(device)
+        feat = batch["speech_feat"].to(device)
+        feat_len = batch["speech_feat_len"].to(device)
+        embedding = batch["embedding"].to(device)
+        embedding = paddle.nn.functional.normalize(x=embedding, axis=1)
+        embedding = self.spk_embed_affine_layer(embedding)
+        mask = (~make_pad_mask(token_len)).astype("float32").unsqueeze(-1).to(device)
+        token = self.input_embedding(paddle.clip(token, min=0)) * mask
+        h, h_lengths = self.encoder(token, token_len)
+        h = self.encoder_proj(h)
+        h, h_lengths = self.length_regulator(h, feat_len)
+        conds = paddle.zeros(feat.shape, device=token.place)
+        # For about half of the samples, copy a random-length prefix (at most 30% of feat_len) of feat into conds.
+        for i, j in enumerate(feat_len):
+            if random.random() < 0.5:
+                continue
+            index = random.randint(0, int(0.3 * j))
+            conds[i, :index] = feat[i, :index]
+        conds = conds.transpose([0, 2, 1])  # swap the last two axes (3-D tensor assumed)
+        mask = (~make_pad_mask(feat_len)).to(h)
+        loss, _ = self.decoder.compute_loss(
+            feat.transpose([0, 2, 1]).contiguous(),
+            mask.unsqueeze(1),
+            h.transpose([0, 2, 1]).contiguous(),
+            embedding,
+            cond=conds,
+        )
+        return {"loss": loss}
+
+    @paddle.no_grad()
+    def inference(
+        self,
+        token,
+        token_len,
+        prompt_token,
+        prompt_token_len,
+        prompt_feat,
+        prompt_feat_len,
+        embedding,
+        flow_cache,
+    ):
+        """Generate features for `token` (batch size 1) after the prompt; returns (feat, flow_cache)."""
+        assert token.shape[0] == 1
+        embedding = paddle.nn.functional.normalize(x=embedding, axis=1)
+        embedding = self.spk_embed_affine_layer(embedding)
+        token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
+        token = paddle.concat([prompt_token, token], axis=1)
+        token_len = prompt_token_len + token_len
+        mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
+        token = self.input_embedding(paddle.clip(token, min=0)) * mask
+        h, h_lengths = self.encoder(token, token_len)
+        h = self.encoder_proj(h)
+        mel_len1, mel_len2 = prompt_feat.shape[1], int(
+            token_len2 / self.input_frame_rate * 22050 / 256
+        )
+        h, h_lengths = self.length_regulator.inference(
+            h[:, :token_len1],
+            h[:, token_len1:],
+            mel_len1,
+            mel_len2,
+            self.input_frame_rate,
+        )
+        conds = paddle.zeros(
+            [1, mel_len1 + mel_len2, self.output_size], device=token.place
+        ).to(h.dtype)
+        conds[:, :mel_len1] = prompt_feat
+        conds = conds.transpose([0, 2, 1])
+        mask = (~make_pad_mask(paddle.to_tensor([mel_len1 + mel_len2]))).to(h)
+        feat, flow_cache = self.decoder(
+            mu=h.transpose([0, 2, 1]).contiguous(),
+            mask=mask.unsqueeze(1),
+            spks=embedding,
+            cond=conds,
+            n_timesteps=10,
+            prompt_len=mel_len1,
+            cache=flow_cache,
+        )
+        feat = feat[:, :, mel_len1:]  # drop the prompt frames
+        assert feat.shape[2] == mel_len2
+        return feat.astype("float32"), flow_cache
+
+
+class CausalMaskedDiffWithXvec(paddle.nn.Layer):
+    """Streaming-capable flow model: like MaskedDiffWithXvec but without a length regulator; passes `streaming` on."""
+    def __init__(
+        self,
+        input_size: int = 512,
+        output_size: int = 80,
+        spk_embed_dim: int = 192,
+        output_type: str = "mel",
+        vocab_size: int = 6561,
+        input_frame_rate: int = 25,
+        only_mask_loss: bool = True,
+        token_mel_ratio: int = 2,
+        pre_lookahead_len: int = 3,
+        encoder: paddle.nn.Layer = None,
+        decoder: paddle.nn.Layer = None,
+        decoder_conf: Dict = {
+            "in_channels": 240,
+            "out_channel": 80,
+            "spk_emb_dim": 80,
+            "n_spks": 1,
+            "cfm_params": DictConfig(
+                {
+                    "sigma_min": 1e-06,
+                    "solver": "euler",
+                    "t_scheduler": "cosine",
+                    "training_cfg_rate": 0.2,
+                    "inference_cfg_rate": 0.7,
+                    "reg_loss_type": "l1",
+                }
+            ),
+            "decoder_params": {
+                "channels": [256, 256],
+                "dropout": 0.0,
+                "attention_head_dim": 64,
+                "n_blocks": 4,
+                "num_mid_blocks": 12,
+                "num_heads": 8,
+                "act_fn": "gelu",
+            },
+        },
+        mel_feat_conf: Dict = {
+            "n_fft": 1024,
+            "num_mels": 80,
+            "sampling_rate": 22050,
+            "hop_size": 256,
+            "win_size": 1024,
+            "fmin": 0,
+            "fmax": 8000,
+        },
+    ):
+        super().__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+        self.decoder_conf = decoder_conf
+        self.mel_feat_conf = mel_feat_conf
+        self.vocab_size = vocab_size
+        self.output_type = output_type
+        self.input_frame_rate = input_frame_rate
+        logging.info(f"input frame rate={self.input_frame_rate}")
+        self.input_embedding = paddle.nn.Embedding(vocab_size, input_size)
+        self.spk_embed_affine_layer = paddle.nn.Linear(
+            in_features=spk_embed_dim, out_features=output_size
+        )
+        self.encoder = encoder
+        self.encoder_proj = paddle.nn.Linear(
+            in_features=self.encoder.output_size(), out_features=output_size
+        )
+        self.decoder = decoder
+        self.only_mask_loss = only_mask_loss
+        self.token_mel_ratio = token_mel_ratio
+        self.pre_lookahead_len = pre_lookahead_len
+
+    def forward(self, batch: dict, device) -> Dict[str, Optional[paddle.Tensor]]:
+        """Return {"loss": ...} from `decoder.compute_loss`; streaming mode is chosen at random per call."""
+        token = batch["speech_token"].to(device)
+        token_len = batch["speech_token_len"].to(device)
+        feat = batch["speech_feat"].to(device)
+        feat_len = batch["speech_feat_len"].to(device)
+        embedding = batch["embedding"].to(device)
+        streaming = random.random() < 0.5
+        embedding = paddle.nn.functional.normalize(x=embedding, axis=1)
+        embedding = self.spk_embed_affine_layer(embedding)
+        mask = (~make_pad_mask(token_len)).astype("float32").unsqueeze(-1).to(device)
+        token = self.input_embedding(paddle.clip(token, min=0)) * mask
+        h, h_lengths = self.encoder(token, token_len, streaming=streaming)
+        h = self.encoder_proj(h)
+        conds = paddle.zeros(feat.shape, device=token.place)
+        # For about half of the samples, copy a random-length prefix (at most 30% of feat_len) of feat into conds.
+        for i, j in enumerate(feat_len):
+            if random.random() < 0.5:
+                continue
+            index = random.randint(0, int(0.3 * j))
+            conds[i, :index] = feat[i, :index]
+        conds = conds.transpose([0, 2, 1])  # swap the last two axes (3-D tensor assumed)
+        mask = (~make_pad_mask(h_lengths.sum(axis=-1).squeeze(axis=1))).to(h)
+        loss, _ = self.decoder.compute_loss(
+            feat.transpose([0, 2, 1]).contiguous(),
+            mask.unsqueeze(1),
+            h.transpose([0, 2, 1]).contiguous(),
+            embedding,
+            cond=conds,
+            streaming=streaming,
+        )
+        return {"loss": loss}
+
+    @paddle.no_grad()
+    def inference(
+        self,
+        token,
+        token_len,
+        prompt_token,
+        prompt_token_len,
+        prompt_feat,
+        prompt_feat_len,
+        embedding,
+        streaming,
+        finalize,
+    ):
+        """Generate features for `token` (batch size 1) after the prompt; returns (feat, None)."""
+        assert token.shape[0] == 1
+        embedding = paddle.nn.functional.normalize(x=embedding, axis=1)
+        embedding = self.spk_embed_affine_layer(embedding)
+        token = paddle.concat([prompt_token, token], axis=1)
+        token_len = prompt_token_len + token_len
+        mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
+        token = self.input_embedding(paddle.clip(token, min=0)) * mask
+        if finalize is True:
+            h, h_lengths = self.encoder(token, token_len, streaming=streaming)
+        else:
+            token, context = (
+                token[:, : -self.pre_lookahead_len],
+                token[:, -self.pre_lookahead_len :],
+            )
+            h, h_lengths = self.encoder(
+                token, token_len, context=context, streaming=streaming
+            )
+        mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
+        h = self.encoder_proj(h)
+        conds = paddle.zeros(
+            [1, mel_len1 + mel_len2, self.output_size], device=token.place
+        ).to(h.dtype)
+        conds[:, :mel_len1] = prompt_feat
+        conds = conds.transpose([0, 2, 1])
+        mask = (~make_pad_mask(paddle.to_tensor([mel_len1 + mel_len2]))).to(h)
+        feat, _ = self.decoder(
+            mu=h.transpose([0, 2, 1]).contiguous(),
+            mask=mask.unsqueeze(1),
+            spks=embedding,
+            cond=conds,
+            n_timesteps=10,
+            streaming=streaming,
+        )
+        feat = feat[:, :, mel_len1:]  # drop the prompt frames
+        assert feat.shape[2] == mel_len2
+        return feat.astype("float32"), None
diff --git a/paddlespeech/t2s/modules/flow/flow_matching.py b/paddlespeech/t2s/modules/flow/flow_matching.py
new file mode 100644
index 0000000000..bf8c9f7a0f
--- /dev/null
+++ b/paddlespeech/t2s/modules/flow/flow_matching.py
@@ -0,0 +1,250 @@
+import paddle
+from matcha.models.components.flow_matching import BASECFM
+
+from cosyvoice.utils.common import set_all_random_seed
+
+
+class ConditionalCFM(BASECFM):
+    def __init__(
+        self,
+        in_channels,
+        cfm_params,
+        n_spks=1,
+        spk_emb_dim=64,
+        estimator: paddle.nn.Layer = None,
+    ):
+        super().__init__(
+            n_feats=in_channels,
+            cfm_params=cfm_params,
+            n_spks=n_spks,
+            spk_emb_dim=spk_emb_dim,
+        )
+        self.t_scheduler = cfm_params.t_scheduler
+        self.training_cfg_rate = cfm_params.training_cfg_rate
+        self.inference_cfg_rate = cfm_params.inference_cfg_rate
+        # Either a paddle.nn.Layer or a pool with acquire/release_estimator().
+        self.estimator = estimator
+
+    @paddle.no_grad()
+    def forward(
+        self,
+        mu,
+        mask,
+        n_timesteps,
+        temperature=1.0,
+        spks=None,
+        cond=None,
+        prompt_len=0,
+        cache=None,
+    ):
+        """Sample a mel-spectrogram by integrating the flow from random noise.
+
+        Args:
+            mu (paddle.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (paddle.Tensor): output_mask
+                shape: (batch_size, 1, mel_timesteps)
+            n_timesteps (int): number of Euler steps
+            temperature (float, optional): scale of the initial noise. Defaults to 1.0.
+            spks (paddle.Tensor, optional): speaker embedding. Defaults to None.
+                shape: (batch_size, spk_emb_dim)
+            cond: conditioning tensor, forwarded to the estimator
+            prompt_len (int): leading frames copied into the returned cache
+            cache (paddle.Tensor, optional): stacked (noise, mu) of an earlier call,
+                read as cache[..., 0] / cache[..., 1]. None means "no cache".
+
+        Returns:
+            tuple: (sample, cache); sample is (batch_size, n_feats, mel_timesteps)
+        """
+        z = paddle.randn(shape=mu.shape, dtype=mu.dtype).to(mu.place) * temperature
+        # None replaces a tensor default that was allocated at definition time.
+        cache_size = 0 if cache is None else cache.shape[2]
+        if cache_size != 0:
+            z[:, :, :cache_size] = cache[:, :, :, 0]
+            mu[:, :, :cache_size] = cache[:, :, :, 1]  # in-place on the caller's mu
+        z_cache = paddle.cat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2)
+        mu_cache = paddle.cat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2)
+        cache = paddle.stack([z_cache, mu_cache], dim=-1)
+        t_span = paddle.linspace(start=0, stop=1, num=n_timesteps + 1, dtype=mu.dtype)
+        if self.t_scheduler == "cosine":
+            t_span = 1 - paddle.cos(t_span * 0.5 * paddle.pi)
+        return self.solve_euler(z, t_span, mu, mask, spks, cond), cache
+
+    def solve_euler(self, x, t_span, mu, mask, spks, cond, streaming=False):
+        """Fixed-step Euler solver for the flow ODE with a two-row guidance batch.
+
+        Row 0 of each buffer carries mu/spks/cond while row 1 keeps them zero; the
+        two estimator outputs are blended with ``inference_cfg_rate``. Buffers are
+        hard-coded to 2 rows x 80 channels, so batch 1 inputs are expected.
+
+        Args:
+            x (paddle.Tensor): random noise, (1, 80, mel_timesteps)
+            t_span (paddle.Tensor): time grid, (n_timesteps + 1,)
+            mu, cond (paddle.Tensor): encoder output / conditioning, shaped like x
+            mask (paddle.Tensor): output mask, (1, 1, mel_timesteps)
+            spks (paddle.Tensor): speaker embedding, copied into a (2, 80) buffer
+            streaming (bool): forwarded to the estimator
+
+        Returns:
+            paddle.Tensor: the state after the last step, via ``.float()``
+        """
+        t, dt = t_span[0].unsqueeze(dim=0), t_span[1] - t_span[0]
+        n_frames = x.size(2)
+        alloc = {"device": x.place, "dtype": x.dtype}
+        x_in = paddle.zeros([2, 80, n_frames], **alloc)
+        mask_in = paddle.zeros([2, 1, n_frames], **alloc)
+        mu_in = paddle.zeros([2, 80, n_frames], **alloc)
+        t_in = paddle.zeros([2], **alloc)
+        spks_in = paddle.zeros([2, 80], **alloc)
+        cond_in = paddle.zeros([2, 80, n_frames], **alloc)
+        for step in range(1, len(t_span)):
+            x_in[:] = x
+            mask_in[:] = mask
+            mu_in[0] = mu
+            t_in[:] = t.unsqueeze(0)
+            spks_in[0] = spks
+            cond_in[0] = cond
+            dphi_dt = self.forward_estimator(
+                x_in, mask_in, mu_in, t_in, spks_in, cond_in, streaming
+            )
+            dphi_dt, cfg_dphi_dt = paddle.compat.split(
+                dphi_dt, [x.size(0), x.size(0)], dim=0
+            )
+            dphi_dt = (
+                1.0 + self.inference_cfg_rate
+            ) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt
+            x = x + dt * dphi_dt
+            t = t + dt
+            if step < len(t_span) - 1:
+                dt = t_span[step + 1] - t
+        return x.float()  # only the final state is needed; no per-step history
+
+    def forward_estimator(self, x, mask, mu, t, spks, cond, streaming=False):
+        """Run the estimator as a Paddle layer, or through a pooled engine."""
+        if isinstance(self.estimator, paddle.nn.Layer):
+            return self.estimator(x, mask, mu, t, spks, cond, streaming=streaming)
+        [estimator, stream], trt_engine = self.estimator.acquire_estimator()
+        try:
+            paddle.device.current_stream().synchronize()
+            with stream:
+                estimator.set_input_shape("x", (2, 80, x.size(2)))
+                estimator.set_input_shape("mask", (2, 1, x.size(2)))
+                estimator.set_input_shape("mu", (2, 80, x.size(2)))
+                estimator.set_input_shape("t", (2,))
+                estimator.set_input_shape("spks", (2, 80))
+                estimator.set_input_shape("cond", (2, 80, x.size(2)))
+                data_ptrs = [  # six inputs, then x again as the 7th binding
+                    x.contiguous().data_ptr(),
+                    mask.contiguous().data_ptr(),
+                    mu.contiguous().data_ptr(),
+                    t.contiguous().data_ptr(),
+                    spks.contiguous().data_ptr(),
+                    cond.contiguous().data_ptr(),
+                    x.data_ptr(),
+                ]
+                for i, j in enumerate(data_ptrs):
+                    estimator.set_tensor_address(trt_engine.get_tensor_name(i), j)
+                # Called outside `assert` so the launch is not stripped by -O.
+                cuda_stream = paddle.device.current_stream().cuda_stream
+                if estimator.execute_async_v3(cuda_stream) is not True:
+                    raise AssertionError("execute_async_v3 did not return True")
+                paddle.device.current_stream().synchronize()
+        finally:  # hand the engine back even when binding or execution fails
+            self.estimator.release_estimator(estimator, stream)
+        return x
+
+    def compute_loss(self, x1, mask, mu, spks=None, cond=None, streaming=False):
+        """Computes the conditional flow matching loss.
+
+        Args:
+            x1 (paddle.Tensor): target, (batch_size, n_feats, mel_timesteps)
+            mask (paddle.Tensor): target mask, (batch_size, 1, mel_timesteps)
+            mu (paddle.Tensor): output of encoder, same shape as x1
+            spks (paddle.Tensor, optional): speaker embedding, (batch_size, spk_emb_dim)
+            cond (paddle.Tensor, optional): conditioning passed to the estimator
+            streaming (bool): forwarded to the estimator
+
+        Returns:
+            loss: masked sum of squared errors / (sum(mask) * u.shape[1])
+            y: the noisy interpolation between z and x1 fed to the estimator
+        """
+        b = mu.shape[0]
+        t = paddle.rand(shape=[b, 1, 1], dtype=mu.dtype)
+        if self.t_scheduler == "cosine":
+            t = 1 - paddle.cos(t * 0.5 * paddle.pi)
+        z = paddle.randn(shape=x1.shape, dtype=x1.dtype)
+        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
+        u = x1 - (1 - self.sigma_min) * z
+        if self.training_cfg_rate > 0:
+            # Zero the conditioning of a random subset of samples. The mask is
+            # cast per tensor and reshaped with explicit lists (no bool promotion).
+            keep = paddle.rand(shape=[b]) > self.training_cfg_rate
+            mu = mu * keep.astype(mu.dtype).reshape([-1, 1, 1])
+            if spks is not None:
+                spks = spks * keep.astype(spks.dtype).reshape([-1, 1])
+            if cond is not None:
+                cond = cond * keep.astype(cond.dtype).reshape([-1, 1, 1])
+        pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
+        loss = paddle.nn.functional.mse_loss(
+            input=pred * mask, label=u * mask, reduction="sum"
+        ) / (paddle.sum(mask) * u.shape[1])
+        return loss, y
+
+
+class CausalConditionalCFM(ConditionalCFM):
+    """ConditionalCFM variant that starts every sample from one fixed noise tensor."""
+
+    def __init__(
+        self,
+        in_channels,
+        cfm_params,
+        n_spks=1,
+        spk_emb_dim=64,
+        estimator: paddle.nn.Layer = None,
+    ):
+        super().__init__(
+            in_channels,
+            cfm_params,
+            n_spks=n_spks,
+            spk_emb_dim=spk_emb_dim,
+            estimator=estimator,
+        )
+        # Presumably fixes the RNG seed so the noise below is reproducible - confirm.
+        set_all_random_seed(0)
+        self.rand_noise = paddle.randn([1, 80, 50 * 300])
+
+    @paddle.no_grad()
+    def forward(
+        self,
+        mu,
+        mask,
+        n_timesteps,
+        temperature=1.0,
+        spks=None,
+        cond=None,
+        streaming=False,
+    ):
+        """Integrate the flow starting from the stored noise.
+
+        Args:
+            mu (paddle.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (paddle.Tensor): output_mask
+                shape: (batch_size, 1, mel_timesteps)
+            n_timesteps (int): number of Euler steps
+            temperature (float, optional): scale of the noise. Defaults to 1.0.
+            spks (paddle.Tensor, optional): speaker embedding. Defaults to None.
+            cond: conditioning tensor, forwarded to the solver
+            streaming (bool): forwarded to the solver
+
+        Returns:
+            tuple: (sample, None)
+        """
+        n_frames = mu.size(2)
+        noise = self.rand_noise[:, :, :n_frames].to(mu.place).to(mu.dtype)
+        steps = paddle.linspace(start=0, stop=1, num=n_timesteps + 1, dtype=mu.dtype)
+        if self.t_scheduler == "cosine":
+            steps = 1 - paddle.cos(steps * 0.5 * paddle.pi)
+        extra = dict(mu=mu, mask=mask, spks=spks, cond=cond, streaming=streaming)
+        sample = self.solve_euler(noise * temperature, t_span=steps, **extra)
+        return sample, None
diff --git a/paddlespeech/t2s/modules/flow/length_regulator.py b/paddlespeech/t2s/modules/flow/length_regulator.py
new file mode 100644
index 0000000000..db6a35818b
--- /dev/null
+++ b/paddlespeech/t2s/modules/flow/length_regulator.py
@@ -0,0 +1,91 @@
+from typing import Tuple
+
+import paddle
+
+from cosyvoice.utils.mask import make_pad_mask
+
+############################## Related utility functions: begin ##############################
+
+def _Tensor_max(self, *args, **kwargs):
+    """torch-style ``Tensor.max`` shim, installed below as ``paddle.Tensor._max``.
+
+    ``other=`` or one Tensor positional -> element-wise ``paddle.maximum``;
+    ``dim``/``axis`` or any other positional -> ``(max, argmax)``; else ``paddle.max``.
+    """
+    if "other" in kwargs:
+        kwargs["y"] = kwargs.pop("other")
+        return paddle.maximum(self, *args, **kwargs)
+    if len(args) == 1 and isinstance(args[0], paddle.Tensor):
+        return paddle.maximum(self, *args, **kwargs)
+    if "dim" in kwargs:
+        kwargs["axis"] = kwargs.pop("dim")
+    if not ("axis" in kwargs or args):
+        return paddle.max(self, *args, **kwargs)
+    return paddle.max(self, *args, **kwargs), paddle.argmax(self, *args, **kwargs)
+
+paddle.Tensor._max = _Tensor_max
+############################## Related utility functions: end ##############################
+
+
+
+class InterpolateRegulator(paddle.nn.Layer):
+    """Resamples features along time by linear interpolation, then applies convs."""
+
+    def __init__(
+        self,
+        channels: int,
+        sampling_ratios: Tuple,
+        out_channels: int = None,
+        groups: int = 1,
+    ):
+        super().__init__()
+        self.sampling_ratios = sampling_ratios
+        out_channels = out_channels or channels
+        layers = []
+        for _ in sampling_ratios:  # one conv/norm/act stage per entry; values unused
+            layers.append(paddle.nn.Conv1d(channels, channels, 3, 1, 1))
+            layers.append(paddle.nn.GroupNorm(num_groups=groups, num_channels=channels))
+            layers.append(paddle.nn.Mish())
+        layers.append(paddle.nn.Conv1d(channels, out_channels, 1, 1))
+        self.model = paddle.nn.Sequential(*layers)
+
+    def forward(self, x, ylens=None):
+        """Interpolate ``x`` (expected as batch, time, channels) to ``max(ylens)``.
+
+        Returns ``(out * mask, ylens)``; the mask presumably zeroes padded frames.
+        ``ylens`` must be given despite its ``None`` default.
+        """
+        mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
+        # interpolate() documents `size` as a list/tuple/Tensor, so pass [T_out]
+        # rather than the 0-D tensor the former `_max()` shim call produced.
+        x = paddle.nn.functional.interpolate(
+            x=x.transpose(1, 2).contiguous(), size=[int(ylens.max())], mode="linear"
+        )
+        out = self.model(x).transpose(1, 2).contiguous()
+        return out * mask, ylens
+
+    def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
+        """Interpolate prompt ``x1`` and new ``x2`` to mel_len1 / mel_len2 frames.
+
+        When x2 has more than 40 frames, its first and last 20 frames are each
+        resized to int(20 / input_frame_rate * 22050 / 256); the middle fills the rest.
+        """
+        resize = paddle.nn.functional.interpolate
+
+        def to_len(feats, n):
+            return resize(x=feats.transpose(1, 2).contiguous(), size=[n], mode="linear")
+
+        if x2.shape[1] > 40:
+            edge = int(20 / input_frame_rate * 22050 / 256)
+            x2_head = to_len(x2[:, :20], edge)
+            x2_mid = to_len(x2[:, 20:-20], mel_len2 - edge * 2)
+            x2_tail = to_len(x2[:, -20:], edge)
+            x2 = paddle.cat([x2_head, x2_mid, x2_tail], dim=2)
+        else:
+            x2 = to_len(x2, mel_len2)
+        if x1.shape[1] != 0:
+            x = paddle.cat([to_len(x1, mel_len1), x2], dim=2)
+        else:
+            x = x2
+        out = self.model(x).transpose(1, 2).contiguous()
+        return out, mel_len1 + mel_len2
\ No newline at end of file
diff --git a/paddlespeech/t2s/modules/tokenizer.py b/paddlespeech/t2s/modules/tokenizer.py
new file mode 100644
index 0000000000..53ea044f51
--- /dev/null
+++ b/paddlespeech/t2s/modules/tokenizer.py
@@ -0,0 +1,241 @@
+import base64
+import os
+from functools import lru_cache
+from paddlenlp.transformers import AutoTokenizer
+import paddle
+import tiktoken
+
+LANGUAGES = {  # code -> name; key order sets the <|lang|> token order
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+    "yue": "cantonese",
+    "minnan": "minnan",
+    "wuyu": "wuyu",
+    "dialect": "dialect",
+    "zh/en": "zh/en",
+    "en/zh": "en/zh",
+}
+TO_LANGUAGE_CODE = {  # language name (plus the aliases below) -> code
+    **{language: code for code, language in LANGUAGES.items()},
+    "burmese": "my",
+    "valencian": "ca",
+    "flemish": "nl",
+    "haitian": "ht",
+    "letzeburgesch": "lb",
+    "pushto": "ps",
+    "panjabi": "pa",
+    "moldavian": "ro",
+    "moldovan": "ro",
+    "sinhalese": "si",
+    "castilian": "es",
+    "mandarin": "zh",
+}
+AUDIO_EVENT = {  # like EMOTION below: keys become <|...|> special tokens
+    "ASR": "ASR",
+    "AED": "AED",
+    "SER": "SER",
+    "Speech": "Speech",
+    "/Speech": "/Speech",
+    "BGM": "BGM",
+    "/BGM": "/BGM",
+    "Laughter": "Laughter",
+    "/Laughter": "/Laughter",
+    "Applause": "Applause",
+    "/Applause": "/Applause",
+}
+EMOTION = {"HAPPY": "HAPPY", "SAD": "SAD", "ANGRY": "ANGRY", "NEUTRAL": "NEUTRAL"}
+TTS_Vocal_Token = {  # identity map; keys become <|TTS/...|> special tokens
+    "TTS/B": "TTS/B",
+    "TTS/O": "TTS/O",
+    "TTS/Q": "TTS/Q",
+    "TTS/A": "TTS/A",
+    "TTS/CO": "TTS/CO",
+    "TTS/CL": "TTS/CL",
+    "TTS/H": "TTS/H",
+    **{f"TTS/SP{i:02d}": f"TTS/SP{i:02d}" for i in range(1, 14)},  # SP01..SP13
+}
+
+
+@lru_cache(maxsize=None)
+def get_encoding(name: str = "gpt2", num_languages: int = 99):
+    """Build the tiktoken encoding from ``assets/<name>.tiktoken`` plus specials."""
+    vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
+    # Close the file deterministically and skip whitespace-only lines, which
+    # the former `if line` test let through to a failing two-value unpack.
+    with open(vocab_path, encoding="utf-8") as vocab_file:
+        pairs = (line.split() for line in vocab_file if line.strip())
+        ranks = {base64.b64decode(token): int(rank) for token, rank in pairs}
+    specials = [
+        "<|endoftext|>",
+        "<|startoftranscript|>",
+        *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
+        *[f"<|{audio_event}|>" for audio_event in list(AUDIO_EVENT.keys())],
+        *[f"<|{emotion}|>" for emotion in list(EMOTION.keys())],
+        "<|translate|>",
+        "<|transcribe|>",
+        "<|startoflm|>",
+        "<|startofprev|>",
+        "<|nospeech|>",
+        "<|notimestamps|>",
+        *[f"<|SPECIAL_TOKEN_{i}|>" for i in range(1, 31)],
+        *[f"<|{tts}|>" for tts in list(TTS_Vocal_Token.keys())],
+        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
+    ]
+    # Specials get consecutive ids right after the vocabulary, in list order.
+    special_tokens = {tok: len(ranks) + i for i, tok in enumerate(specials)}
+    n_vocab = len(ranks) + len(specials)
+    return tiktoken.Encoding(
+        name=os.path.basename(vocab_path),
+        explicit_n_vocab=n_vocab,
+        pat_str="'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
+        mergeable_ranks=ranks,
+        special_tokens=special_tokens,
+    )
+
+
+class QwenTokenizer:
+    """Qwen2-0.5B tokenizer wrapper: text -> id list and id list -> text."""
+
+    def __init__(self, skip_special_tokens=True):
+        super().__init__()
+        extra_tokens = [
+            "<|im_start|>",
+            "<|im_end|>",
+            "<|endofprompt|>",
+            "[breath]",
+            "<strong>",
+            "</strong>",
+            "[noise]",
+            "[laughter]",
+            "[cough]",
+            "[clucking]",
+            "[accent]",
+            "[quick_breath]",
+            "<laughter>",
+            "</laughter>",
+            "[hissing]",
+            "[sigh]",
+            "[vocalized-noise]",
+            "[lipsmack]",
+            "[mn]",
+        ]
+        self.special_tokens = {
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "additional_special_tokens": extra_tokens,
+        }
+        self.skip_special_tokens = skip_special_tokens
+        self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
+        self.tokenizer.add_special_tokens(self.special_tokens)
+
+    def encode(self, text, **kwargs):
+        """Return the token ids of ``text`` as a Python list; kwargs are ignored."""
+        encoded = self.tokenizer([text], return_tensors="pd")
+        return encoded["input_ids"][0].cpu().tolist()
+
+    def decode(self, tokens):
+        ids = paddle.tensor(tokens, dtype=paddle.int64)  # decoded as a batch of one
+        skip = self.skip_special_tokens
+        return self.tokenizer.batch_decode([ids], skip_special_tokens=skip)[0]
+
+
+@lru_cache(maxsize=None)  # one cached tokenizer per distinct argument
+def get_qwen_tokenizer(skip_special_tokens: bool) -> QwenTokenizer:
+    return QwenTokenizer(skip_special_tokens)  # first positional __init__ parameter
diff --git a/q.pdparams b/q.pdparams
new file mode 100644
index 0000000000..731d141199
Binary files /dev/null and b/q.pdparams differ
diff --git a/q.pt b/q.pt
new file mode 100644
index 0000000000..4d315c18cf
Binary files /dev/null and b/q.pt differ