
Commit a2038ca

Merge branch 'main' into release/2.6
2 parents: f934283 + 5521907

18 files changed (+65, -110 lines)

requirements/framework.txt

Lines changed: 2 additions & 0 deletions
@@ -4,10 +4,12 @@ aiohttp
 attrdict
 binpacking
 dacite
+datasets>=3.0
 einops
 importlib_metadata
 jieba
 matplotlib
+modelscope[datasets]>=1.19
 nltk
 numpy<2.0
 oss2

setup.py

Lines changed: 0 additions & 20 deletions
@@ -1,12 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # !/usr/bin/env python
 import os
-import shutil
 from setuptools import find_packages, setup
 from typing import List
 
-from packaging import version
-
 
 def readme():
     with open('README.md', encoding='utf-8') as f:
@@ -118,25 +115,8 @@ def gen_packages_items():
     return gen_packages_items()
 
 
-def add_modelscope_requirement(install_requires: List[str]) -> None:
-    # The future version will remove.
-    try:
-        import modelscope
-        modelscope_version = modelscope.__version__
-    except ImportError:
-        modelscope_version = '1.18'
-
-    if version.parse(modelscope_version) >= version.parse('1.19'):
-        install_requires.append('datasets>=3.0')
-        install_requires.append('modelscope[datasets]>=1.19')
-    else:
-        install_requires.append('datasets<3.0')
-        install_requires.append('modelscope[datasets]>=1.17,<1.19')
-
-
 if __name__ == '__main__':
     install_requires, deps_link = parse_requirements('requirements.txt')
-    add_modelscope_requirement(install_requires)
     extra_requires = {}
     all_requires = []
     extra_requires['llm'], _ = parse_requirements('requirements/llm.txt')
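
The removed helper picked datasets/modelscope pins at build time based on the locally installed modelscope version; those pins are now declared statically in requirements/framework.txt (see the first diff above). A minimal sketch, assuming both packages are installed in the current environment, of checking that the new static pins are satisfied; this snippet is illustrative and not part of the commit:

from importlib.metadata import version
from packaging.version import Version

# These mirror the new pins in requirements/framework.txt.
assert Version(version('modelscope')) >= Version('1.19')
assert Version(version('datasets')) >= Version('3.0')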

swift/llm/utils/template.py

Lines changed: 6 additions & 8 deletions
@@ -1627,18 +1627,20 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
                     + 1:]
                 added_tokens_len += token_len - 1
         data.update(media_inputs)
-
-        inputs['input_ids'] = input_ids
+        # The architecture will be optimized in ms-swift3.0
+        data['input_ids'] = input_ids
         inputs['labels'] = labels
-        data['input_ids'] = torch.tensor(input_ids)[None]
         inputs['_data'] = data
+        inputs.update(data)
         return inputs, {}
 
     def _post_encode(self, model, data: Any) -> Dict[str, Any]:
+        if not self._is_training:
+            return data
         _model = model.model
         if not hasattr(_model, 'embed_tokens'):
             _model = _model.model  # LoRA
-        input_ids = data['input_ids']
+        input_ids = torch.tensor(data['input_ids'], device=model.device)[None]
         pixel_values = data.get('pixel_values')
         pixel_values_videos = data.get('pixel_values_videos')
         inputs_embeds = _model.embed_tokens(input_ids)
@@ -1685,10 +1687,6 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None)
         res['position_ids'] = position_ids.contiguous()
         return res
 
-    @staticmethod
-    def _get_generate_ids(generate_ids: List[int], input_token_len: int) -> List[int]:
-        return generate_ids
-
 
 class Qwen2VLTemplate(_Qwen2VLTemplateMixin, QwenTemplate):
     pass
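
The net effect: `_encode` now keeps `input_ids` as a plain list inside `_data`, and `_post_encode` materializes it as a batched tensor on the model's device (skipping the custom embedding path entirely outside training). A minimal sketch, with illustrative values, of the tensor construction used above:

import torch

input_ids = [101, 2009, 2003, 102]      # token ids as produced by _encode
device = torch.device('cpu')            # stand-in for model.device
batched = torch.tensor(input_ids, device=device)[None]  # [None] adds the batch dim
print(batched.shape)                    # torch.Size([1, 4])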

swift/torchacc_utils.py

Lines changed: 16 additions & 4 deletions
@@ -27,14 +27,26 @@ def get_bucket_sizes(max_length: int) -> List[int]:
         the bucket sizes. If not set, we use a normal distribution bucketing with
         8 buckets.
     """
+    padding_p_base = 2
     if os.getenv('TORCHACC_DATA_BUCKETS') is not None:
         bucket_sizes = [int(x) for x in os.getenv('TORCHACC_DATA_BUCKETS').split(',')]
         bucket_sizes.append(max_length)
-    else:  # default normal distribution bucketing.
-        mean = max_length // 2
-        var = max_length // 8
-        bucket_sizes = [mean + i * var for i in range(-3, 4)]
+    else:
+        if os.getenv('TORCHACC_CACHE_PATH') is not None:  # padding strategy when persistent cache is enabled
+            padding_p_base = 1.4
+        padding_p_base = os.getenv('TORCHACC_PADDING_P_BASE', padding_p_base)
+        try:
+            padding_p_base = float(padding_p_base)
+        except ValueError as e:
+            logger.error(f'Expect TORCHACC_PADDING_P_BASE to be a float number, but encountered {padding_p_base}')
+            raise e
+        bucket_sizes = [16, 32, 48, 64, 96, 128]
+        base_size = 256
+        while base_size < max_length:
+            bucket_sizes.append((int(base_size) + 127) // 128 * 128)
+            base_size *= padding_p_base
         bucket_sizes.append(max_length)
+
     return bucket_sizes
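
The new default replaces the old ±3σ normal-distribution buckets with a geometric ladder: a handful of small fixed buckets, then sizes that grow by a factor of padding_p_base and are rounded up to multiples of 128 until max_length is reached. A standalone sketch (not the library function itself) showing the sizes this produces:

def geometric_buckets(max_length: int, padding_p_base: float = 2.0) -> list:
    # Fixed small buckets, then geometric growth rounded up to multiples of 128.
    bucket_sizes = [16, 32, 48, 64, 96, 128]
    base_size = 256
    while base_size < max_length:
        bucket_sizes.append((int(base_size) + 127) // 128 * 128)
        base_size *= padding_p_base
    bucket_sizes.append(max_length)
    return bucket_sizes

print(geometric_buckets(1024, 1.4))
# [16, 32, 48, 64, 96, 128, 256, 384, 512, 768, 1024, 1024]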

swift/trainers/trainers.py

Lines changed: 3 additions & 1 deletion
@@ -213,7 +213,9 @@ def compute_loss(self, model, inputs, return_outputs=None, num_items_in_batch=None):
             acc = torch.tensor(acc_list, device=preds.device).float().mean()
         else:
             if use_torchacc():
-                ta_trim_graph()
+                # Only enabled during evaluation/test
+                if not model.training:
+                    ta_trim_graph()
             preds = preds.to('cpu')
             masks = masks.to('cpu')
             labels = labels.to('cpu')
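
This guard relies on the standard PyTorch module flag: model.training is True after model.train() and False after model.eval(), so graph trimming now only fires during evaluation or test passes. A tiny illustration, assuming any nn.Module:

import torch.nn as nn

model = nn.Linear(4, 2)
model.train()
assert model.training       # training mode: ta_trim_graph() would be skipped
model.eval()
assert not model.training   # eval mode: ta_trim_graph() would run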

swift/tuners/adapter.py

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ def _feed_forward_chunk(self, attention_output):
         setattr(module, f'adapter_{adapter_name}', adapter_module)
         logger.info(f'Adapter modules(module_key): {module_key}.adapter_{adapter_name}')
 
-        def state_dict_callback(state_dict, adapter_name: str):
+        def state_dict_callback(state_dict, adapter_name: str, **kwargs):
             return {key: value for key, value in state_dict.items() if f'adapter_{adapter_name}' in key}
 
         def mark_trainable_callback(model):
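
The same signature change is applied to state_dict_callback in llamapro.py, longlora.py, and lora.py below: accepting **kwargs lets the framework pass new keyword arguments to all callbacks without breaking tuners that ignore them. A minimal sketch with an illustrative extra argument (hypothetical, not from the commit):

def state_dict_callback(state_dict, adapter_name: str, **kwargs):
    # Extra keywords (e.g. a hypothetical future `replace_key=`) are accepted
    # and simply ignored by callbacks that do not need them.
    return {k: v for k, v in state_dict.items() if f'adapter_{adapter_name}' in k}

sd = {'layer.adapter_default.weight': 1, 'layer.weight': 2}
print(state_dict_callback(sd, 'default', replace_key=False))
# {'layer.adapter_default.weight': 1}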

swift/tuners/base.py

Lines changed: 0 additions & 23 deletions
@@ -359,29 +359,6 @@ def from_pretrained(cls,
                 continue
             state_dict = cls.load_state_file(sub_folder)
             if state_dict is not None:
-                model_is_qlora = len([
-                    k for k in self.state_dict().keys()
-                    if k.endswith(f'.lora_A.{_adapter}.weight') or k.endswith(f'.lora_B.{_adapter}.weight')
-                ])
-                if not model_is_qlora:
-                    # model is lora, state_dict: qlora->lora
-                    state_dict = {
-                        k[:-len(f'.{_name}.weight') if k.endswith(f'.lora_A.{_name}.weight') or k.
-                        endswith(f'.lora_B.{_name}.weight') else None]: v
-                        for k, v in state_dict.items()
-                    }
-                if any(['loramodule' in key for key in state_dict]):
-                    # Compatible with old checkpoints before ms-swift:1.5.0
-                    state_dict = {
-                        key.replace(f'loramodule_{_name}.lora_A', 'lora_A') if f'loramodule_{_name}.lora_A.{_name}'
-                        in key else key.replace(f'loramodule_{_name}.lora_A', f'lora_A.{_name}.weight'): value
-                        for key, value in state_dict.items()
-                    }
-                    state_dict = {
-                        key.replace(f'loramodule_{_name}.lora_B', 'lora_B') if f'loramodule_{_name}.lora_B.{_name}'
-                        in key else key.replace(f'loramodule_{_name}.lora_B', f'lora_B.{_name}.weight'): value
-                        for key, value in state_dict.items()
-                    }
                 if isinstance(adapter_name, dict):
                     # TODO this logic is fragile! replace `_name` may cause other parts replaced
                     state_dict = {key.replace(_name, adapter_name[_name]): value for key, value in state_dict.items()}
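
What remains after this deletion is the plain renaming step flagged by the TODO: every occurrence of the stored adapter name is substring-replaced in each key, which can clobber unrelated parts of a key containing the same text. An illustrative sketch of both the intended behavior and the hazard:

state_dict = {'model.lora_A.default.weight': 1.0}
adapter_name = {'default': 'my_adapter'}   # requested rename
_name = 'default'
renamed = {k.replace(_name, adapter_name[_name]): v for k, v in state_dict.items()}
print(renamed)  # {'model.lora_A.my_adapter.weight': 1.0}
# Hazard: a key like 'model.default_head.lora_A.default.weight' would have
# both occurrences of 'default' replaced, not just the adapter segment.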

swift/tuners/llamapro.py

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ def prepare_model(model: nn.Module, config: LLaMAProConfig, adapter_name: str)
         model.config.num_hidden_layers = len(new_module_list)
         LLaMAPro._set_module_list(config, model, new_module_list)
 
-        def state_dict_callback(state_dict, adapter_name):
+        def state_dict_callback(state_dict, adapter_name, **kwargs):
             model_key_mapping = LLaMAPro.get_model_key_mapping(config.model_type, config)
             new_module_list = [model_key_mapping.module_list + f'.{i}' for i in new_module_idx]
             return {

swift/tuners/longlora/longlora.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def prepare_model(model: nn.Module, config: LongLoRAConfig, adapter_name: str):
         """Prepare a model with `LongLoRAConfig`"""
         LoraModel(model, config, adapter_name)
 
-        def state_dict_callback(state_dict, adapter_name):
+        def state_dict_callback(state_dict, adapter_name, **kwargs):
             _state_dict = lora_state_dict(state_dict, adapter_name, config.bias)
             for name, value in state_dict.items():
                 if isinstance(config.embedder_and_normalizer, str):

swift/tuners/lora.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ def prepare_model(model: nn.Module, config: LoRAConfig, adapter_name: str):
             config.group_size = getattr(auto_gptq_config, 'group_size', None)
         LoraModel(model, config, adapter_name)
 
-        def state_dict_callback(state_dict, adapter_name, cfg=None):
+        def state_dict_callback(state_dict, adapter_name, cfg=None, **kwargs):
             return lora_state_dict(state_dict, adapter_name, cfg.bias if cfg else config.bias)
 
         def mark_trainable_callback(model, cfg=None):
