modelscope
diff --git a/‎README.md‎
Lines changed: 12 additions & 1 deletion b/‎README.md‎
Lines changed: 12 additions & 1 deletion
diff --git a/‎README_CN.md‎
Lines changed: 12 additions & 1 deletion b/‎README_CN.md‎
Lines changed: 12 additions & 1 deletion
diff --git a/‎docs/source/LLM/支持的模型和数据集.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/source/LLM/支持的模型和数据集.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/source/LLM/自定义与拓展.md‎
Lines changed: 9 additions & 9 deletions b/‎docs/source/LLM/自定义与拓展.md‎
Lines changed: 9 additions & 9 deletions
diff --git a/‎docs/source/LLM/自我认知微调最佳实践.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/source/LLM/自我认知微调最佳实践.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/utils/test_readme.py‎
Lines changed: 33 additions & 0 deletions b/‎scripts/utils/test_readme.py‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎swift/llm/infer.py‎
Lines changed: 35 additions & 11 deletions b/‎swift/llm/infer.py‎
Lines changed: 35 additions & 11 deletions
diff --git a/‎swift/llm/sft.py‎
Lines changed: 1 addition & 2 deletions b/‎swift/llm/sft.py‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎swift/llm/utils/argument.py‎
Lines changed: 17 additions & 7 deletions b/‎swift/llm/utils/argument.py‎
Lines changed: 17 additions & 7 deletions
diff --git a/‎swift/llm/utils/dataset.py‎
Lines changed: 1 addition & 0 deletions b/‎swift/llm/utils/dataset.py‎
Lines changed: 1 addition & 0 deletions
@@ -15,7 +15,10 @@
 <p align="center">
 <img src="https://img.shields.io/badge/python-%E2%89%A53.8-5be.svg">
 <img src="https://img.shields.io/badge/pytorch-%E2%89%A51.12%20%7C%20%E2%89%A52.0-orange.svg">
-<a href="https://github.com/modelscope/modelscope/"><img src="https://img.shields.io/badge/modelscope-%E2%89%A51.9.3-5D91D4.svg"></a>
+<a href="https://github.com/modelscope/modelscope/"><img src="https://img.shields.io/badge/modelscope-%E2%89%A51.9.5-5D91D4.svg"></a>
+<a href="https://pypi.org/project/ms-swift/"><img src="https://badge.fury.io/py/ms-swift.svg"></a>
+<a href="https://github.com/modelscope/swift/blob/main/LICENSE"><img src="https://img.shields.io/github/license/modelscope/swift"></a>
+<a href="https://pepy.tech/project/ms-swift"><img src="https://pepy.tech/badge/ms-swift"></a>
 <a href="https://github.com/modelscope/swift/"><img src="https://img.shields.io/badge/ms--swift-Build from source-6FEBB9.svg"></a>
 </p>
 
@@ -70,6 +73,8 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用
 - 🔥 2023.11.10: Support for **bluelm** series models: bluelm-7b, bluelm-7b-chat, bluelm-7b-32k, bluelm-7b-chat-32k. The corresponding shell script can be found in [bluelm_7b_chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/bluelm_7b_chat).
 - 🔥 2023.11.08: Support the finetuning of **xverse-65b** model, scripts can be found at: [xverse_65b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/xverse_65b).
 - 🔥 2023.11.07: Support the finetuning of **yi-6b**, **yi-34b** model, scripts can be found at: [yi_6b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/yi_6b), [yi_34b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/yi_34b).
+<details><summary>More</summary>
+
 - 🔥 2023.10.30: Support **QA-LoRA** and **LongLoRA** to decrease memory usage in training.
 - 🔥 2023.10.30: Support **ROME**(Rank One Model Editing) to add/modify knowledges, training is not needed!
 - 2023.10.30: Support for **skywork-13b** series models: skywork-13b, skywork-13b-chat. The corresponding shell script can be found in [skywork_13b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/skywork_13b).
@@ -79,6 +84,12 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用
 - 2023.10.12: Supported **mistral-7b** model series: openbuddy-mistral-7b-chat, mistral-7b, mistral-7b-chat.
 - 🔥 2023.10.7: Supported **DeepSpeed ZeRO-2**, enabling LoRA (not just QLoRA) to run DDP on 2*A10.
 - 2023.10.4: Supported datasets in the fields of mathematics, law, SQL, and coding: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en.
+- 🔥 2023.9.25: Supported **qwen-14b** model series: qwen-14b, qwen-14b-chat.
+- 2023.9.18: Supported **internlm-20b** model series: internlm-20b, internlm-20b-chat.
+- 2023.9.12: Supported training with **MP+DDP** to accelerate full-parameter fine-tuning speed.
+- 2023.9.5: Supported **openbuddy-llama2-70b-chat** model.
+- 2023.9.3: Supported **baichuan2** model series: baichuan2-7b, baichuan2-7b-chat, baichuan2-13b, baichuan2-13b-chat.
+</details>
 
 
 ## ✨ LLM Training and Inference
 
@@ -15,7 +15,10 @@
 <p align="center">
 <img src="https://img.shields.io/badge/python-%E2%89%A53.8-5be.svg">
 <img src="https://img.shields.io/badge/pytorch-%E2%89%A51.12%20%7C%20%E2%89%A52.0-orange.svg">
-<a href="https://github.com/modelscope/modelscope/"><img src="https://img.shields.io/badge/modelscope-%E2%89%A51.9.3-5D91D4.svg"></a>
+<a href="https://github.com/modelscope/modelscope/"><img src="https://img.shields.io/badge/modelscope-%E2%89%A51.9.5-5D91D4.svg"></a>
+<a href="https://pypi.org/project/ms-swift/"><img src="https://badge.fury.io/py/ms-swift.svg"></a>
+<a href="https://github.com/modelscope/swift/blob/main/LICENSE"><img src="https://img.shields.io/github/license/modelscope/swift"></a>
+<a href="https://pepy.tech/project/ms-swift"><img src="https://pepy.tech/badge/ms-swift"></a>
 <a href="https://github.com/modelscope/swift/"><img src="https://img.shields.io/badge/ms--swift-Build from source-6FEBB9.svg"></a>
 </p>
 
@@ -68,6 +71,8 @@ SWIFT（Scalable lightWeight Infrastructure for Fine-Tuning）是一个可扩展
 - 🔥 2023.11.10: 支持**bluelm**系列模型: bluelm-7b, bluelm-7b-chat, bluelm-7b-32k, bluelm-7b-chat-32k. 对应的sh脚本可以查看[bluelm_7b_chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/bluelm_7b_chat).
 - 🔥 2023.11.08: 支持**xverse-65b**模型的训练和推理流程，脚本在[xverse_65b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/xverse_65b).
 - 🔥 2023.11.07: 支持**yi-6b**, **yi-34b**模型的训练和推理流程，脚本在[yi_6b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/yi_6b), [yi_34b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/yi_34b).
+<details><summary>更多</summary>
+
 - 🔥 2023.10.30: 支持 **QA-LoRA** 和 **LongLoRA**两种新的tuners.
 - 🔥 2023.10.30: 支持使用**ROME**(Rank One Model Editing)来编辑模型，在无需训练的情况下即可给模型灌注新知识！
 - 2023.10.30: 支持**skywork-13b**系列模型: skywork-13b, skywork-13b-chat. 对应的sh脚本可以查看[skywork_13b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/skywork_13b).
@@ -77,6 +82,12 @@ SWIFT（Scalable lightWeight Infrastructure for Fine-Tuning）是一个可扩展
 - 2023.10.12: 支持**mistral-7b**系列模型: openbuddy-mistral-7b-chat, mistral-7b, mistral-7b-chat.
 - 🔥 2023.10.7: 支持**DeepSpeed ZeRO-2**, 使得lora(不仅仅是qlora)可以在双卡A10上运行DDP.
 - 2023.10.4: 支持更多数学, 法律, SQL, 代码领域的数据集: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en.
+- 🔥 2023.9.25: 支持**qwen-14b**系列模型: qwen-14b, qwen-14b-chat.
+- 2023.9.18: 支持**internlm-20b**系列模型: internlm-20b, internlm-20b-chat.
+- 2023.9.12: 支持**MP+DDP**训练, 以加速全参数微调.
+- 2023.9.5: 支持**openbuddy-llama2-70b-chat**模型.
+- 2023.9.3: 支持**baichuan2**系列模型: baichuan2-7b, baichuan2-7b-chat, baichuan2-13b, baichuan2-13b-chat.
+</details>
 
 
 ## ✨ 大模型训练推理
 
@@ -96,7 +96,7 @@
 - Dataset Name: 数据集在swift中注册的dataset\_name.
 - Dataset ID: 数据集在[ModelScope](https://www.modelscope.cn/my/overview)上的dataset\_id.
 - Size: 数据集中的数据样本数量.
-- Statistic: 数据集的统计量. 我们使用token数进行统计, 这对于调整`max_length`超参数有帮助. 我们将数据集的训练集和验证集进行拼接, 然后进行统计. 我们使用qwen的tokenizer对数据集进行分词. 不同的tokenizer的统计量不同, 如果你要获取其他的模型的tokenizer的token统计量, 可以通过[脚本](https://github.com/modelscope/swift/tree/main/benchmark/run_dataset.py)自行获取.
+- Statistic: 数据集的统计量. 我们使用token数进行统计, 这对于调整`max_length`超参数有帮助. 我们将数据集的训练集和验证集进行拼接, 然后进行统计. 我们使用qwen的tokenizer对数据集进行分词. 不同的tokenizer的统计量不同, 如果你要获取其他的模型的tokenizer的token统计量, 可以通过[脚本](https://github.com/modelscope/swift/tree/main/scripts/utils/run_dataset_info.py)自行获取.
 
 | Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags |
 | ------------ | ---------- | ---------- | -------- | ----------------- | ---- |
 
@@ -30,15 +30,6 @@
 
 **格式1:**
 
-```csv
-instruction,input,output
-11111,22222,33333
-aaaaa,bbbbb,ccccc
-AAAAA,BBBBB,CCCCC
-```
-
-**格式2:**
-
 Pretraining
 
 ```csv
@@ -83,6 +74,15 @@ Multi-Round Dialogue
 {"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]]}]
 ```
 
+**格式2:**
+
+```csv
+instruction,input,output
+11111,22222,33333
+aaaaa,bbbbb,ccccc
+AAAAA,BBBBB,CCCCC
+```
+
 **格式3:**
 
 ```jsonl
 
@@ -25,7 +25,6 @@ pip install -r requirements/llm.txt  -U
 ```
 
 ## 微调前推理
-如果你要进行单样本推理, 可以参考[LLM推理文档](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM%E6%8E%A8%E7%90%86%E6%96%87%E6%A1%A3.md#qwen-7b-chat)
 
 使用python:
 ```python
@@ -69,6 +68,7 @@ My name is QianWen, developed by Alibaba Cloud. I am designed to answer various
 如果以上方法都不能帮助你改善睡眠，建议你咨询医生或专业的睡眠治疗师。
 """
 ```
+如果你要进行单样本推理, 可以参考[LLM推理文档](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM%E6%8E%A8%E7%90%86%E6%96%87%E6%A1%A3.md#qwen-7b-chat)
 
 使用CLI:
 ```bash
 
@@ -0,0 +1,33 @@
+import os
+import re
+
+import torch
+from modelscope import snapshot_download
+
+from swift.llm import MODEL_MAPPING
+
+
+def test_readme():  # Run the modelscope Python snippets found in every registered model's README.
+    for model_type, model_info in MODEL_MAPPING.items():
+        model_id = model_info['model_id_or_path']
+        model_dir = snapshot_download(model_id, revision='master')
+        readme_path = os.path.join(model_dir, 'README.md')
+        assert os.path.exists(readme_path)
+        with open(readme_path, 'r', encoding='utf-8') as f:  # explicit: do not rely on the locale default
+            text = f.read()
+
+        code_list = re.findall(r'```python\n(.+?)\n```', text, re.M | re.S)
+        print(f'model_type: {model_type}')
+        for code in code_list:
+            if 'import' not in code or 'modelscope' not in code:
+                continue  # only snippets that import something and mention modelscope are run
+            try:
+                exec(code)  # NOTE(review): executes downloaded README code verbatim; trusted repos only
+            except Exception:
+                print(code)  # show the failing snippet, then pause until the operator presses ENTER
+                input('[ENTER]')
+        torch.cuda.empty_cache()
+
+
+if __name__ == '__main__':
+    test_readme()
@@ -7,11 +7,12 @@
 import json
 import torch
 from modelscope import BitsAndBytesConfig, GenerationConfig
+from tqdm import tqdm
 from transformers import PreTrainedModel
 
 from swift.tuners import Swift
 from swift.utils import (append_to_jsonl, get_logger, print_model_info,
-                         seed_everything, show_layers)
+                         read_multi_line, seed_everything, show_layers)
 from .utils import (InferArguments, Template, get_dataset, get_model_tokenizer,
                     get_template, inference, inference_stream)
 
@@ -157,21 +158,35 @@ def llm_infer(args: InferArguments) -> None:
     if args.save_result and args.ckpt_dir is not None:
         time = dt.datetime.now().strftime('%Y%m%d-%H%M%S')
         jsonl_path = os.path.join(args.ckpt_dir, f'infer_result_{time}.jsonl')
+    input_mode: Literal['S', 'M'] = 'S'
     if args.eval_human:
-        print_str = 'Input `exit` to exit the conversation'
+        logger.info('Input `exit` to exit the conversation.')
+        logger.info('Input `multi-line` to switch to multi-line input mode.')
         if template.support_multi_round:
-            print_str += ', input `clear` to clear the history.'
+            logger.info('Input `clear` to clear the history.')
         else:
-            print_str += ', The current template only supports single-round dialogues.'
-        logger.info(print_str)
+            logger.info(
+                'The current template only supports single-round dialogues.')
         history = []
         while True:
-            query = input('<<< ')
+            if input_mode == 'S':
+                query = input('<<< ')
+            else:
+                query = read_multi_line()
             if query.strip().lower() == 'exit':
                 break
             elif query.strip().lower() == 'clear':
                 history = []
                 continue
+            if input_mode == 'S' and query.strip().lower() == 'multi-line':
+                input_mode = 'M'
+                logger.info('End multi-line input with `#`.')
+                logger.info(
+                    'Input `single-line` to switch to single-line input mode.')
+                continue
+            if input_mode == 'M' and query.strip().lower() == 'single-line':
+                input_mode = 'S'  # assignment, not comparison: actually switch back to single-line input
+                continue
             if not template.support_multi_round:
                 history = []
             gen = inference_stream(model, template, query, history)
@@ -198,24 +213,33 @@ def llm_infer(args: InferArguments) -> None:
             val_dataset = val_dataset.select(
                 range(min(args.val_dataset_sample, val_dataset.shape[0])))
         logger.info(f'val_dataset: {val_dataset}')
+        if args.verbose is None:
+            if len(val_dataset) >= 100:
+                args.verbose = False
+            else:
+                args.verbose = True
+            logger.info(f'Setting args.verbose: {args.verbose}')
+        if not args.verbose:
+            val_dataset = tqdm(val_dataset)
         for data in val_dataset:
             _, history = inference(
                 model,
                 template,
                 data.get('query'),
                 data.get('history'),
                 data.get('system'),
-                stream=args.stream,
-                verbose=True)
+                stream=args.stream and args.verbose,
+                verbose=args.verbose)
             label = data.get('response')
             item = history[0]
             obj = {'query': item[0], 'response': item[1], 'label': label}
             if jsonl_path is not None:
                 append_to_jsonl(jsonl_path, obj)
             result.append(obj)
-            print()
-            print(f'[LABELS]{label}')
-            print('-' * 50)
+            if args.verbose:
+                print()
+                print(f'[LABELS]{label}')
+                print('-' * 50)
     if args.save_result and args.ckpt_dir is not None:
         logger.info(f'save_result_path: {jsonl_path}')
     return {'result': result}
@@ -141,8 +141,7 @@ def llm_sft(args: SftArguments) -> str:
     if val_dataset is not None and val_dataset_sample is not None and val_dataset_sample >= 0:
         if val_dataset.shape[0] > val_dataset_sample:
             logger.info(f'val_dataset_sample: {val_dataset_sample}')
-            val_idxs = random_state.permutation(val_dataset_sample)
-            val_dataset = val_dataset.select(val_idxs)
+            val_dataset = val_dataset.select(range(val_dataset_sample))
     # add self-cognition dataset
     if args.self_cognition_sample > 0:
         train_dataset = add_self_cognition_dataset(train_dataset,
 
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import math
 import os
 from dataclasses import dataclass, field
 from typing import List, Optional, Set, Tuple, Union
@@ -101,7 +102,7 @@ class SftArguments:
     optim: str = 'adamw_torch'
     learning_rate: Optional[float] = None
     weight_decay: float = 0.01
-    gradient_accumulation_steps: int = 16
+    gradient_accumulation_steps: Optional[int] = None
     max_grad_norm: float = 0.5
     predict_with_generate: bool = False
     lr_scheduler_type: str = 'cosine'
@@ -186,8 +187,9 @@ def __post_init__(self) -> None:
                 logger.info(f'output_dir: {self.output_dir}')
 
         self.torch_dtype, self.fp16, self.bf16 = select_dtype(self)
+        world_size = 1
         if is_dist():
-            rank, local_rank, _, _ = get_dist_setting()
+            rank, local_rank, world_size, _ = get_dist_setting()
             torch.cuda.set_device(local_rank)
             self.seed += rank  # Avoid the same dropout
             if self.ddp_backend == 'gloo' and self.quantization_bit != 0:
@@ -267,6 +269,9 @@ def __post_init__(self) -> None:
             self.logging_dir = f'{self.output_dir}/runs'
         if self.report_to is None:
             self.report_to == ['all']
+        if self.gradient_accumulation_steps is None:
+            self.gradient_accumulation_steps = math.ceil(16 / self.batch_size
+                                                         / world_size)
 
 
 @dataclass
@@ -334,6 +339,7 @@ class InferArguments:
     stream: bool = True
     merge_lora_and_save: bool = False
     overwrite_generation_config: bool = False
+    verbose: Optional[bool] = None
     # compatibility
     show_dataset_sample: int = 10
 
@@ -485,11 +491,15 @@ def set_model_type(args: Union[SftArguments, InferArguments]) -> None:
         model_id_or_path = args.model_id_or_path
         model_id_or_path_lower = model_id_or_path.lower()
         if model_id_or_path_lower not in model_mapping_reversed:
-            error_msg = f"`model_id_or_path`: '{model_id_or_path}' is not registered."
-            if os.path.exists(model_id_or_path):
-                error_msg += (
-                    ' Please use `--model_id_or_path <model_id> --model_cache_dir <local_path>` '
-                    'to specify the local cache path for the model.')
+            if isinstance(args,
+                          InferArguments) and 'checkpoint' in model_id_or_path:
+                error_msg = 'Please use `--ckpt_dir vx_xxx/checkpoint-xxx` to use the checkpoint.'
+            else:
+                error_msg = f"`model_id_or_path`: '{model_id_or_path}' is not registered."
+                if os.path.exists(model_id_or_path):
+                    error_msg += (
+                        ' Please use `--model_id_or_path <model_id> --model_cache_dir <local_path>` '
+                        'to specify the local cache path for the model.')
             raise ValueError(error_msg)
         args.model_type = model_mapping_reversed[model_id_or_path_lower]
 
 
@@ -912,6 +912,7 @@ def get_dataset(
             train_subset_split_list=dataset_info['train_subset_split_list'],
             val_subset_split_list=dataset_info['val_subset_split_list'],
             preprocess_func=dataset_info['preprocess_func'])
+        train_d: HfDataset
         if isinstance(dataset, (list, tuple)):
             train_d, val_d = dataset
         else: