update arguments (#1043)

Jintao-Huang · huangjintao · web-flow · commit 4e18bffb2dad · 2024-06-02T14:32:43.000+08:00
Co-authored-by: huangjintao <huangjintao@163.com>
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -9,33 +9,60 @@ Swift DOCUMENTATION
    :maxdepth: 2
    :caption: Get Started
 
-   GetStarted/快速使用.md
    GetStarted/SWIFT安装.md
    GetStarted/界面训练推理.md
    GetStarted/使用tuners.md
    GetStarted/ResTuning.md
+   GetStarted/SCEdit.md
    GetStarted/在SWIFT内使用PEFT.md
-   GetStarted/部署指南.md
 
 .. toctree::
    :maxdepth: 2
-   :caption: LLM Training and Inference Example
+   :caption: LLM Training and Inference
 
-   LLM/自我认知微调最佳实践.md
-   LLM/Agent微调最佳实践.md
    LLM/LLM推理文档.md
    LLM/LLM微调文档.md
    LLM/DPO训练文档.md
-   LLM/ORPO算法最佳实践.md
+   LLM/LLM评测文档.md
+   LLM/LLM量化文档.md
    LLM/VLLM推理加速与部署.md
+   LLM/LLM实验文档.md
+   LLM/命令行参数.md
    LLM/支持的模型和数据集.md
    LLM/自定义与拓展.md
-   LLM/命令行参数.md
+   LLM/自我认知微调最佳实践.md
+   LLM/Agent微调最佳实践.md
+   LLM/Qwen1.5全流程最佳实践.md
+   LLM/NPU推理与微调最佳实践.md
+   LLM/Grok训练和推理.md
+   LLM/ORPO算法最佳实践.md
+   LLM/SimPO算法最佳实践.md
+   LLM/HuggingFace生态兼容.md
    LLM/Benchmark.md
 
 .. toctree::
    :maxdepth: 2
-   :caption: AIGC Training and Inference Example
+   :caption: Multi-Modal LLM Training and Inference
+
+   Multi-Modal/qwen-vl最佳实践.md
+   Multi-Modal/qwen-audio最佳实践.md
+   Multi-Modal/deepseek-vl最佳实践.md
+   Multi-Modal/internlm-xcomposer2最佳实践.md
+   Multi-Modal/phi3-vision最佳实践.md
+   Multi-Modal/llava最佳实践.md
+   Multi-Modal/yi-vl最佳实践.md
+   Multi-Modal/mplug-owl2最佳实践.md
+   Multi-Modal/cogvlm最佳实践.md
+   Multi-Modal/cogvlm2最佳实践.md
+   Multi-Modal/minicpm-v最佳实践.md
+   Multi-Modal/minicpm-v-2最佳实践.md
+   Multi-Modal/minicpm-v-2.5最佳实践.md
+   Multi-Modal/internvl最佳实践.md
+   Multi-Modal/MLLM部署文档.md
+
+.. toctree::
+   :maxdepth: 2
+   :caption: AIGC Training and Inference
 
    AIGC/AnimateDiff微调推理文档.md
 
diff --git a/docs/source_en/index.rst b/docs/source_en/index.rst
@@ -12,28 +12,54 @@ Swift DOCUMENTATION
    GetStarted/Installation.md
    GetStarted/Web-ui.md
    GetStarted/Tuners.md
-   GetStarted/SCEdit.md
    GetStarted/ResTuning.md
+   GetStarted/SCEdit.md
    GetStarted/Use-PEFT.md
 
 .. toctree::
    :maxdepth: 2
-   :caption: LLM Training and Inference Example
+   :caption: LLM Training and Inference
 
-   LLM/Self-cognition-best-practice.md
-   LLM/Agent-best-practice.md
    LLM/LLM-fine-tuning.md
    LLM/LLM-inference.md
    LLM/DPO.md
+   LLM/LLM-eval.md
+   LLM/LLM-quantization.md
    LLM/VLLM-inference-acceleration-and-deployment.md
+   LLM/LLM-exp.md
+   LLM/Command-line-parameters.md
    LLM/Supported-models-datasets.md
    LLM/Customization.md
-   LLM/Command-line-parameters.md
+   LLM/Self-cognition-best-practice.md
+   LLM/Agent-best-practice.md
+   LLM/Qwen1.5-best-practice.md
+   LLM/Grok-1-best-practice.md
+   LLM/ORPO.md
+   LLM/SimPO.md
+   LLM/Compat-HF.md
    LLM/Benchmark.md
 
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Multi-Modal LLM Training and Inference
+
+   Multi-Modal/qwen-vl-best-practice.md
+   Multi-Modal/qwen-audio-best-practice.md
+   Multi-Modal/deepseek-vl-best-practice.md
+   Multi-Modal/internlm-xcomposer2-best-practice.md
+   Multi-Modal/phi3-vision-best-practice.md
+   Multi-Modal/llava-best-practice.md
+   Multi-Modal/yi-vl-best-practice.md
+   Multi-Modal/cogvlm-best-practice.md
+   Multi-Modal/cogvlm2-best-practice.md
+   Multi-Modal/minicpm-v-best-practice.md
+   Multi-Modal/internvl-best-practice.md
+   Multi-Modal/mutlimodal-deployment.md
+
 .. toctree::
    :maxdepth: 2
-   :caption: AIGC Training and Inference Example
+   :caption: AIGC Training and Inference
 
    AIGC/AnimateDiff-train-infer.md
 
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
@@ -22,7 +22,8 @@
 from swift.tuners import Swift
 from swift.utils import (add_version_to_work_dir, get_dist_setting, get_logger, get_pai_tensorboard_dir, is_dist,
                          is_local_master, is_mp, is_pai_training_job, use_torchacc)
-from .dataset import DATASET_MAPPING, _dataset_name_exists, get_dataset, register_dataset_info_file, sample_dataset
+from .dataset import (DATASET_MAPPING, _dataset_name_exists, get_dataset, parse_dataset_name,
+                      register_dataset_info_file, sample_dataset)
 from .model import (MODEL_MAPPING, dtype_mapping, get_additional_saved_files, get_default_lora_target_modules,
                     get_default_template_type)
 from .template import TEMPLATE_MAPPING
@@ -271,9 +272,18 @@ def handle_custom_dataset_info(self):
     def _handle_dataset_sample(self):
         # compatibility. (Deprecated)
         # Avoid post-processing
-        if len(self.dataset) == 1 and '#' not in self.dataset[0] and self.train_dataset_sample >= 0:
-            self.dataset[0] = f'{self.dataset[0]}#{self.train_dataset_sample}'
-            self.train_dataset_sample = -1
+        if len(self.dataset) != 1 or self.train_dataset_sample == -1:
+            return
+        _dataset = self.dataset[0]
+        train_sample = parse_dataset_name(_dataset)[3]
+        if train_sample is None:
+            train_sample = self.train_dataset_sample
+        elif self.train_dataset_sample < train_sample:
+            train_sample = self.train_dataset_sample
+        _dataset = _dataset[:_dataset.find('#')]
+        _dataset = f'{_dataset}#{train_sample}'
+        self.dataset[0] = _dataset
+        self.train_dataset_sample = -1
 
     def _register_self_cognition(self: Union['SftArguments', 'InferArguments']) -> None:
 
@@ -688,11 +698,9 @@ def _prepare_modules_to_save(self, modules_to_save) -> List[str]:
 
     def __post_init__(self) -> None:
         self.handle_compatibility()
-        self._register_self_cognition()
         if len(self.val_dataset) > 0:
             self.dataset_test_ratio = 0.0
             logger.info('Using val_dataset, ignoring dataset_test_ratio')
-        self._handle_dataset_sample()
         if is_pai_training_job():
             self._handle_pai_compat()
         ds_config_folder = os.path.abspath(os.path.join(__file__, '..', '..', 'ds_config'))
@@ -707,6 +715,8 @@ def __post_init__(self) -> None:
                 break
 
         self.handle_path()
+        self._handle_dataset_sample()
+        self._register_self_cognition()
         self.handle_custom_register()
         self.handle_custom_dataset_info()
         self.set_model_type()
@@ -1059,7 +1069,6 @@ def __post_init__(self) -> None:
             logger.warning(f'The checkpoint dir {self.ckpt_dir} passed in is invalid, please make sure'
                            'the dir contains a `configuration.json` file.')
         self.handle_compatibility()
-        self._register_self_cognition()
         if len(self.val_dataset) > 0:
             self.dataset_test_ratio = 0.0
             logger.info('Using val_dataset, ignoring dataset_test_ratio')
@@ -1073,6 +1082,7 @@ def __post_init__(self) -> None:
         else:
             assert self.load_dataset_config is False, 'You need to first set `--load_args_from_ckpt_dir true`.'
         self._handle_dataset_sample()
+        self._register_self_cognition()
         self.handle_custom_register()
         self.handle_custom_dataset_info()
         self.set_model_type()