Skip to content

Commit da4baf5

Browse files
committed
Fix qwen-audio inference bug (#204)
1 parent 9873fdc commit da4baf5

File tree

6 files changed

+66
-47
lines changed

6 files changed

+66
-47
lines changed

docs/source/LLM/LLM推理文档.md

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -274,24 +274,24 @@ template = get_template(template_type, tokenizer)
274274

275275
seed_everything(42)
276276
query = tokenizer.from_list_format([
277-
{'audio': 'demo.wav'},
278-
{'text': '请将语音转成文本'},
277+
{'audio': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/1272-128104-0000.flac'},
278+
{'text': 'what does the person say?'},
279279
])
280280
response, history = inference(model, template, query)
281281
print(f'query: {query}')
282282
print(f'response: {response}')
283-
query = '这句话一般在什么语境下使用'
283+
query = 'Find the start time and end time of the word "middle classes'
284284
response, history = inference(model, template, query, history)
285285
print(f'query: {query}')
286286
print(f'response: {response}')
287287
print(f'history: {history}')
288-
"""
289-
query: Audio 1:<audio>demo.wav</audio>
290-
请将语音转成文本
291-
response: 好的,这是转成的文本:"每一天都要快乐哦"。
292-
query: 这句话一般在什么语境下使用
293-
response: 这句话一般在表达祝福或者鼓励的时候使用,比如在朋友或者亲人过生日的时候说"每一天都要快乐哦",表达祝福的意思。
294-
history: [('Audio 1:<audio>demo.wav</audio>\n请将语音转成文本', '好的,这是转成的文本:"每一天都要快乐哦"。'), ('这句话一般在什么语境下使用', '这句话一般在表达祝福或者鼓励的时候使用,比如在朋友或者亲人过生日的时候说"每一天都要快乐哦",表达祝福的意思。')]
288+
"""Out[0]
289+
query: Audio 1:<audio>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/1272-128104-0000.flac</audio>
290+
what does the person say?
291+
response: The person says: "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel".
292+
query: Find the start time and end time of the word "middle classes
293+
response: The word "middle classes" starts at <|2.33|> seconds and ends at <|3.26|> seconds.
294+
history: [('Audio 1:<audio>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/1272-128104-0000.flac</audio>\nwhat does the person say?', 'The person says: "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel".'), ('Find the start time and end time of the word "middle classes', 'The word "middle classes" starts at <|2.33|> seconds and ends at <|3.26|> seconds.')]
295295
"""
296296
```
297297

docs/source/LLM/命令行参数.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,13 +86,14 @@
8686
- `--template_type`: 默认值为`'AUTO'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看.
8787
- `--ckpt_dir`: 必填项, 值为SFT阶段保存的checkpoint路径, e.g. `'/path/to/your/vx_xxx/checkpoint-xxx'`.
8888
- `--load_args_from_ckpt_dir`: 是否从`ckpt_dir`中的`sft_args.json`文件中读取配置信息. 默认是`True`.
89-
- `--eval_human`: 使用数据集中的验证集部分进行评估还是使用人工的方式评估, 默认值为`False`.
89+
- `--load_dataset_config`: 该参数只有在`--load_args_from_ckpt_dir true`时才生效. 即是否从`ckpt_dir`中的`sft_args.json`文件中读取数据集相关的配置信息. 默认为`True`.
90+
- `--eval_human`: 使用数据集中的验证集部分进行评估还是使用人工的方式评估. 默认值为`None`, 如有传入数据集, 则设置为True, 否则设置为False.
9091
- `--seed`: 默认值为`42`, 具体的参数介绍可以在`sft.sh命令行参数`中查看.
9192
- `--dtype`: 默认值为`'AUTO'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看.
92-
- `--dataset`: 默认值为`'blossom-math-zh'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. 该参数只有在`eval_human`设置为False时才生效.
93-
- `--dataset_seed`: 默认值为`42`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. 该参数只有在`eval_human`设置为False时才生效.
94-
- `--dataset_test_ratio`: 默认值为`0.01`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. 该参数只有在`eval_human`设置为False时才生效.
95-
- `--val_dataset_sample`: 表示想要评估和展示的验证集的数量, 默认值为`10`. 该参数只有在`eval_human`设置为False时才生效.
93+
- `--dataset`: 默认值为`'blossom-math-zh'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. 该参数在`eval_human`设置为True时不生效.
94+
- `--dataset_seed`: 默认值为`42`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. 该参数在`eval_human`设置为True时不生效.
95+
- `--dataset_test_ratio`: 默认值为`0.01`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. 该参数在`eval_human`设置为True时不生效.
96+
- `--val_dataset_sample`: 表示想要评估和展示的验证集的数量, 默认值为`10`. 该参数在`eval_human`设置为True时不生效.
9697
- `--system`: 默认值为`None`. 具体的参数介绍可以在`sft.sh命令行参数`中查看.
9798
- `--max_length`: 默认值为`2048`. 具体的参数介绍可以在`sft.sh命令行参数`中查看.
9899
- `--truncation_strategy`: 默认是`'delete'`. 具体的参数介绍可以在`sft.sh命令行参数`中查看.

swift/llm/infer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def llm_infer(args: InferArguments) -> None:
161161
jsonl_path = os.path.join(args.ckpt_dir, f'infer_result_{time}.jsonl')
162162
if args.eval_human:
163163
input_mode: Literal['S', 'M'] = 'S'
164-
logger.info('Input `exit` to exit the conversation.')
164+
logger.info('Input `exit` or `quit` to exit the conversation.')
165165
logger.info('Input `multi-line` to switch to multi-line input mode.')
166166
if template.support_multi_round:
167167
logger.info('Input `clear` to clear the history.')
@@ -174,7 +174,7 @@ def llm_infer(args: InferArguments) -> None:
174174
query = input('<<< ')
175175
else:
176176
query = read_multi_line()
177-
if query.strip().lower() == 'exit':
177+
if query.strip().lower() in {'exit', 'quit'}:
178178
break
179179
elif query.strip().lower() == 'clear':
180180
history = []
@@ -186,7 +186,7 @@ def llm_infer(args: InferArguments) -> None:
186186
'Input `single-line` to switch to single-line input mode.')
187187
continue
188188
if input_mode == 'M' and query.strip().lower() == 'single-line':
189-
input_mode == 'S'
189+
input_mode = 'S'
190190
continue
191191
if not template.support_multi_round:
192192
history = []

swift/llm/utils/argument.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ def __post_init__(self) -> None:
267267
if self.logging_dir is None:
268268
self.logging_dir = f'{self.output_dir}/runs'
269269
if self.report_to is None:
270-
self.report_to == ['all']
270+
self.report_to = ['all']
271271
if self.gradient_accumulation_steps is None:
272272
self.gradient_accumulation_steps = math.ceil(16 / self.batch_size
273273
/ world_size)
@@ -296,7 +296,7 @@ class InferArguments:
296296
default=None, metadata={'help': '/path/to/your/vx_xxx/checkpoint-xxx'})
297297
load_args_from_ckpt_dir: bool = True
298298
load_dataset_config: bool = True
299-
eval_human: bool = False # False: eval val_dataset
299+
eval_human: Optional[bool] = None # False: eval val_dataset
300300

301301
seed: int = 42
302302
dtype: str = field(
@@ -363,17 +363,22 @@ def __post_init__(self) -> None:
363363
if self.template_type == 'AUTO':
364364
self.template_type = get_default_template_type(self.model_type)
365365
logger.info(f'Setting template_type: {self.template_type}')
366-
if not self.eval_human:
367-
if isinstance(self.dataset, str):
368-
self.dataset = [self.dataset]
369-
elif self.dataset is None:
370-
self.dataset = []
371-
if len(self.dataset) == 0:
372-
if (len(self.custom_train_dataset_path) == 0
373-
and len(self.custom_val_dataset_path) == 0):
374-
raise ValueError(
375-
f'self.dataset: {self.dataset}. Please set `--eval_human true` or `--dataset xxx`'
376-
)
366+
if isinstance(self.dataset, str):
367+
self.dataset = [self.dataset]
368+
elif self.dataset is None:
369+
self.dataset = []
370+
if (len(self.dataset) == 0 and len(self.custom_train_dataset_path) == 0
371+
and len(self.custom_val_dataset_path) == 0):
372+
if self.eval_human is None:
373+
self.eval_human = True
374+
logger.info(f'Setting self.eval_human: {self.eval_human}')
375+
if not self.eval_human:
376+
raise ValueError(
377+
f'self.dataset: {self.dataset}. Please set `--eval_human true` or `--dataset xxx`'
378+
)
379+
elif self.eval_human is None:
380+
self.eval_human = False
381+
logger.info(f'Setting self.eval_human: {self.eval_human}')
377382

378383
self.bnb_4bit_compute_dtype, self.load_in_4bit, self.load_in_8bit = select_bnb(
379384
self)

swift/llm/utils/model.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,19 @@ def fix_qwen_inplace_bug(model) -> None:
801801
first_drop.__old_forward = __old_forward
802802

803803

804+
def _qwen_vl_audio_decode(self,
805+
*args,
806+
skip_special_tokens=False,
807+
**kwargs) -> str:
808+
if skip_special_tokens:
809+
token_ids = kwargs['token_ids']
810+
while len(token_ids) > 0 and token_ids[-1] in {151645, 151643}:
811+
token_ids.pop()
812+
return self._old_decode(*args, skip_special_tokens=False, **kwargs)
813+
else:
814+
return self._old_decode(*args, skip_special_tokens=False, **kwargs)
815+
816+
804817
@register_model(
805818
ModelType.qwen_vl_chat,
806819
'qwen/Qwen-VL-Chat',
@@ -838,21 +851,9 @@ def get_model_tokenizer_qwen_vl(model_dir: str,
838851
load_model, **kwargs)
839852
if model is not None:
840853
fix_qwen_inplace_bug(model)
841-
842-
_old_decode = tokenizer._decode
843-
844-
def _new_decode(*args, skip_special_tokens=False, **kwargs) -> str:
845-
if skip_special_tokens:
846-
token_ids = kwargs['token_ids']
847-
while len(token_ids) > 0 and token_ids[-1] in {151645, 151643}:
848-
token_ids.pop()
849-
return _old_decode(*args, skip_special_tokens=False, **kwargs)
850-
else:
851-
return _old_decode(*args, skip_special_tokens=False, **kwargs)
852-
853854
if not hasattr(tokenizer, '_old_decode'): # avoid double patching
854-
tokenizer._old_decode = _old_decode
855-
tokenizer._decode = _new_decode
855+
tokenizer._old_decode = tokenizer._decode
856+
tokenizer._decode = MethodType(_qwen_vl_audio_decode, tokenizer)
856857

857858
return model, tokenizer
858859

@@ -888,6 +889,10 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
888889
load_model, **kwargs)
889890
if model is not None:
890891
fix_qwen_inplace_bug(model)
892+
if not hasattr(tokenizer, '_old_decode'): # avoid double patching
893+
tokenizer._old_decode = tokenizer._decode
894+
tokenizer._decode = MethodType(_qwen_vl_audio_decode, tokenizer)
895+
891896
return model, tokenizer
892897

893898

swift/llm/utils/template.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from copy import deepcopy
33
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
44

5+
import torch
56
from torch import Tensor
67
from transformers import PreTrainedTokenizerBase, StoppingCriteria
78

@@ -131,8 +132,15 @@ def _encode_context_list(
131132
elif isinstance(context, str):
132133
if (getattr(tokenizer, 'model_type', '').startswith('qwen-audio')):
133134
audio_info = get_audio_info(tokenizer, context=context)
134-
assert 'audio_info' not in kwargs
135-
kwargs['audio_info'] = audio_info
135+
old_audio_info = kwargs.get('audio_info')
136+
if old_audio_info is None:
137+
kwargs['audio_info'] = audio_info
138+
elif audio_info is not None:
139+
for k in ['input_audios', 'input_audio_lengths']:
140+
old_audio_info[k] = torch.concat(
141+
[old_audio_info[k], audio_info[k]], dim=0)
142+
for k in ['audio_span_tokens', 'audio_urls']:
143+
old_audio_info[k] = old_audio_info[k] + audio_info[k]
136144
token_list = tokenizer(
137145
context,
138146
return_attention_mask=False,

0 commit comments

Comments
 (0)