@@ -27,6 +27,7 @@
 
 from torch.distributed.elastic.multiprocessing.errors import record
 
+from cosyvoice.utils.losses import DPOLoss
 from cosyvoice.utils.executor import Executor
 from cosyvoice.utils.train_utils import (
     init_distributed,
@@ -43,6 +44,7 @@ def get_args():
                         choices=['torch_ddp', 'deepspeed'],
                         help='Engine for paralleled training')
     parser.add_argument('--model', required=True, help='model which will be trained')
+    parser.add_argument('--ref_model', required=False, help='ref model used in dpo')
     parser.add_argument('--config', required=True, help='config file')
     parser.add_argument('--train_data', required=True, help='train data file')
     parser.add_argument('--cv_data', required=True, help='cv data file')
@@ -73,6 +75,10 @@ def get_args():
                         action='store_true',
                         default=False,
                         help='Use automatic mixed precision training')
+    parser.add_argument('--dpo',
+                        action='store_true',
+                        default=False,
+                        help='Use Direct Preference Optimization')
     parser.add_argument('--deepspeed.save_states',
                         dest='save_states',
                         default='model_only',
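
A note on the two new flags: `--ref_model` stays `required=False` at the argparse level, so a `--dpo` run that omits it only fails later, when `torch.load(args.ref_model)` receives `None`. A small guard right after parsing (hypothetical, not part of this diff) would surface the mistake early:

```python
# Hypothetical early check, not in this diff: --dpo needs a reference
# checkpoint, otherwise torch.load(args.ref_model) fails later on None.
if args.dpo and args.ref_model is None:
    parser.error('--ref_model is required when --dpo is set')
```
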
@@ -113,7 +119,7 @@ def main():
 
     # Get dataset & dataloader
     train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
-        init_dataset_and_dataloader(args, configs, gan)
+        init_dataset_and_dataloader(args, configs, gan, args.dpo)
 
     # Do some sanity checks and save config to args.model_dir
     configs = check_modify_and_save_config(args, configs)
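
The extra trailing argument threads `args.dpo` into dataset construction, presumably so each batch carries a preferred (chosen) and a dispreferred (rejected) sample for the DPO objective. A minimal sketch of what such a pairing collate could look like (names and layout are assumptions, not CosyVoice's actual dataset code):

```python
import torch
from torch.nn.utils.rnn import pad_sequence

# Sketch only: assumes each sample dict holds 'chosen' and 'rejected'
# 1-D LongTensors; real CosyVoice batches carry more fields.
def dpo_collate(samples):
    return {
        'chosen_token': pad_sequence([s['chosen'] for s in samples],
                                     batch_first=True, padding_value=0),
        'rejected_token': pad_sequence([s['rejected'] for s in samples],
                                       batch_first=True, padding_value=0),
    }
```
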
@@ -122,6 +128,8 @@ def main():
 
     writer = init_summarywriter(args)
 
     # load checkpoint
+    if args.dpo is True:
+        configs[args.model].forward = configs[args.model].forward_dpo
     model = configs[args.model]
     start_step, start_epoch = 0, -1
     if args.checkpoint is not None:
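
Reassigning `configs[args.model].forward` to the bound `forward_dpo` method swaps the code path every later `model(batch)` call dispatches to, because `nn.Module.__call__` looks up `self.forward` and the instance attribute shadows the class method. A toy illustration of the pattern (not the CosyVoice model itself):

```python
import torch

class Toy(torch.nn.Module):
    def forward(self, x):
        return x + 1

    def forward_dpo(self, x):
        # the real forward_dpo would score chosen and rejected sequences
        return x - 1

m = Toy()
m.forward = m.forward_dpo       # instance attribute shadows the class method
print(m(torch.tensor(1)))       # tensor(0): __call__ now reaches forward_dpo
```
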
@@ -150,13 +158,25 @@ def main():
     info_dict['epoch'] = start_epoch
     save_model(model, 'init', info_dict)
 
+    # DPO related
+    if args.dpo is True:
+        ref_model = deepcopy(configs[args.model])
+        state_dict = torch.load(args.ref_model, map_location='cpu')
+        ref_model.load_state_dict(state_dict, strict=False)
+        dpo_loss = DPOLoss(beta=0.01, label_smoothing=0.0, ipo=False)
+        # NOTE wrapping ref_model in DDP may be unnecessary because its parameters are never updated
+        ref_model = wrap_cuda_model(args, ref_model)
+    else:
+        ref_model, dpo_loss = None, None
+
     # Get executor
-    executor = Executor(gan=gan)
+    executor = Executor(gan=gan, ref_model=ref_model, dpo_loss=dpo_loss)
     executor.step = start_step
 
     # Init scaler, used for pytorch amp mixed precision training
     scaler = torch.cuda.amp.GradScaler() if args.use_amp else None
     print('start step {} start epoch {}'.format(start_step, start_epoch))
+
     # Start training loop
     for epoch in range(start_epoch + 1, info_dict['max_epoch']):
         executor.epoch = epoch
@@ -167,7 +187,7 @@ def main():
             executor.train_one_epoc_gan(model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader,
                                         writer, info_dict, scaler, group_join)
         else:
-            executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join)
+            executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join, ref_model=ref_model)
     dist.destroy_process_group(group_join)
 
 
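For context, the `DPOLoss` imported at the top is constructed here with `beta=0.01, label_smoothing=0.0, ipo=False`. Its actual implementation lives in `cosyvoice/utils/losses.py` and may differ in detail; the sketch below is a minimal version of the standard DPO/IPO objective those hyper-parameters correspond to (Rafailov et al., 2023; Azar et al., 2023):

```python
import torch
import torch.nn.functional as F

class DPOLoss(torch.nn.Module):
    """Minimal sketch of a DPO loss, not CosyVoice's exact implementation."""

    def __init__(self, beta: float = 0.01, label_smoothing: float = 0.0, ipo: bool = False):
        super().__init__()
        self.beta = beta
        self.label_smoothing = label_smoothing
        self.ipo = ipo

    def forward(self, policy_chosen_logps, policy_rejected_logps,
                reference_chosen_logps, reference_rejected_logps):
        # Log-ratio of chosen over rejected, for policy and reference.
        pi_logratios = policy_chosen_logps - policy_rejected_logps
        ref_logratios = reference_chosen_logps - reference_rejected_logps
        logits = pi_logratios - ref_logratios
        if self.ipo:
            # IPO regresses the log-ratio gap towards 1 / (2 * beta).
            losses = (logits - 1 / (2 * self.beta)) ** 2
        else:
            # Conservative DPO with optional label smoothing.
            losses = (-F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
                      - F.logsigmoid(-self.beta * logits) * self.label_smoothing)
        # Implicit rewards, detached so they are logging-only.
        chosen_rewards = self.beta * (policy_chosen_logps - reference_chosen_logps).detach()
        rejected_rewards = self.beta * (policy_rejected_logps - reference_rejected_logps).detach()
        return losses.mean(), chosen_rewards, rejected_rewards
```

This also explains why the reference model is loaded with `map_location='cpu'` and never given an optimizer: it only supplies the frozen `reference_*_logps` terms above.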