Commit 1e9f8be

[TorchAcc][Experimental] Integrate TorchAcc. (#647)
1 parent 985eea3 commit 1e9f8be

File tree

16 files changed: +606 -24 lines changed

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+# Experimental environment: 4 * 8*A100
+# 80GB GPU memory
+# Note: TorchAcc is currently only available internally.
+
+export USE_TORCHACC=1
+export XLA_FLAGS='--xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner'
+export XLA_IR_SHAPE_CACHE_SIZE=100000000
+export XLA_ALLOCATOR_FRACTION=0.97
+
+# Note: You need to set the correct MASTER_ADDR, MASTER_PORT and NODE_RANK for each node.
+
+MASTER_ADDR=127.0.0.1 \
+MASTER_PORT=12456 \
+NODE_RANK=0 \
+NNODES=4 \
+NPROC_PER_NODE=8 \
+swift sft \
+    --model_type qwen-72b-chat \
+    --model_layer_cls_name QWenBlock \
+    --dataset codefuse-python-en \
+    --sft_type full \
+    --output_dir output \
+    --num_train_epochs 1 \
+    --max_length 1024 \
+    --batch_size 1 \
+    --use_flash_attn true \
+    --gradient_accumulation_steps 1 \
+    --gradient_checkpointing no \
+    --tuner_backend 'peft' \
+    --eval_steps 200 \
+    --save_steps 200 \
+    --logging_steps 100 \
+    --report_to 'none'
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+# Experimental environment: 4 * A800
+# 80GB GPU memory
+# Note: TorchAcc is currently only available internally.
+
+export USE_TORCHACC=1
+export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
+export XLA_IR_SHAPE_CACHE_SIZE=100000000
+export XLA_ALLOCATOR_FRACTION=0.95
+export XLA_EXPERIMENTAL=nonzero:masked_select
+
+NPROC_PER_NODE=4 \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+swift sft \
+    --model_type qwen-72b-chat \
+    --model_layer_cls_name QWenBlock \
+    --dataset codefuse-python-en \
+    --sft_type lora \
+    --output_dir output_qwen_72b \
+    --num_train_epochs 1 \
+    --max_length 2048 \
+    --batch_size 6 \
+    --use_flash_attn true \
+    --gradient_accumulation_steps 1 \
+    --gradient_checkpointing no \
+    --tuner_backend 'peft' \
+    --eval_steps 200 \
+    --save_steps 200 \
+    --logging_steps 100 \
+    --report_to 'none'

swift/llm/accelerator.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+
+def ta_accelerate(model,
+                  fsdp_num,
+                  layer_cls_name,
+                  bf16=True,
+                  fp16=False,
+                  gradient_checkpointing=True,
+                  fsdp_flatten_parameters=False):
+    """Accelerate LLM training using TorchAcc (only available internally).
+    """
+    import torchacc as ta
+    assert layer_cls_name is not None
+
+    def get_ta_config():
+        config = ta.Config()
+        config.compute.fp16 = fp16
+        config.compute.bf16 = bf16
+
+        config.memory.gc = gradient_checkpointing
+        if config.memory.gc:
+            config.memory.gc_cls = {layer_cls_name}
+
+        config.dist.fsdp.size = fsdp_num
+        config.dist.fsdp.wrap_layer_cls = {layer_cls_name}
+        config.dist.fsdp.flatten_parameters = fsdp_flatten_parameters
+
+        return config
+
+    ta_config = get_ta_config()
+    model = ta.accelerate(model, ta_config)
+    return model
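
For orientation, here is a minimal sketch of how `ta_accelerate` might be called directly, mirroring the call added in swift/llm/sft.py below. The checkpoint id, dtype, and device count are illustrative assumptions, and `torchacc` itself is internal-only, so this will not run outside that environment.

# Minimal usage sketch (assumptions: torchacc is available, 4 GPUs, and a Qwen
# checkpoint whose decoder block class is 'QWenBlock').
import torch
from transformers import AutoModelForCausalLM

from swift.llm.accelerator import ta_accelerate

model = AutoModelForCausalLM.from_pretrained(
    'Qwen/Qwen-72B-Chat',            # illustrative checkpoint id
    torch_dtype=torch.bfloat16,
    trust_remote_code=True)
model = ta_accelerate(
    model,
    fsdp_num=4,                      # shard parameters across 4 devices via FSDP
    layer_cls_name='QWenBlock',      # decoder block to wrap and checkpoint
    bf16=True,
    fp16=False,
    gradient_checkpointing=True,
    fsdp_flatten_parameters=False)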

swift/llm/sft.py

Lines changed: 48 additions & 5 deletions
@@ -12,11 +12,13 @@
 from transformers.utils import is_torch_npu_available

 from swift.trainers import Seq2SeqTrainer
+from swift.trainers.utils import can_return_loss, find_labels
 from swift.utils import (check_json_format, compute_acc_metrics,
                          compute_nlg_metrics, get_dist_setting, get_logger,
                          get_main, get_model_info, is_ddp_plus_mp, is_dist,
                          is_master, plot_images, preprocess_logits_for_metrics,
-                         seed_everything, show_layers)
+                         seed_everything, show_layers, use_torchacc)
+from .accelerator import ta_accelerate
 from .tuner import prepare_model
 from .utils import (TEMPLATE_MAPPING, LazyLLMDataset, SftArguments, Template,
                     add_self_cognition_dataset, dataset_map, get_dataset,
@@ -55,15 +57,15 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
     model_kwargs = {'low_cpu_mem_usage': True}
     if is_dist() and not is_ddp_plus_mp():
         model_kwargs['device_map'] = {'': local_rank}
-    else:
+    elif not use_torchacc():
         model_kwargs['device_map'] = 'auto'
+
     if args.load_in_8bit or args.load_in_4bit:
         quantization_config = BitsAndBytesConfig(
             args.load_in_8bit,
             args.load_in_4bit,
             bnb_4bit_compute_dtype=args.bnb_4bit_compute_dtype,
             bnb_4bit_quant_type=args.bnb_4bit_quant_type,
-            bnb_4bit_quant_storage=args.bnb_4bit_quant_storage,
             bnb_4bit_use_double_quant=args.bnb_4bit_use_double_quant)
         logger.info(f'quantization_config: {quantization_config.__dict__}')
         model_kwargs['quantization_config'] = quantization_config
@@ -93,6 +95,13 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
     set_generation_config(model, generation_config)
     training_args.generation_config = generation_config

+    if use_torchacc():
+        import torchacc as ta
+        # Get `label_names` and `return_loss` before `ta_accelerate`, because
+        # wrapping the model makes these properties unreliable afterwards.
+        label_names = find_labels(model)
+        return_loss = can_return_loss(model)
+        model = ta.patch_qwen_model(model)
     # Preparing LoRA
     model, callbacks = prepare_model(model, args)

@@ -108,6 +117,18 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
         logger.info('Setting model.config.use_cache: False')
         model.enable_input_require_grads()

+    if use_torchacc():
+        model.config.use_cache = False
+        logger.info('Setting model.config.use_cache: False')
+        model = ta_accelerate(
+            model,
+            world_size,
+            args.model_layer_cls_name,
+            args.bf16,
+            args.fp16,
+            gradient_checkpointing=True,
+            fsdp_flatten_parameters=False)
+
     # Loading Dataset
     random_state = np.random.RandomState(args.dataset_seed)
     train_dataset, val_dataset = get_dataset(
@@ -185,6 +206,15 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
     padding_to = args.max_length if args.sft_type == 'longlora' else None
     data_collator = partial(template.data_collator, padding_to=padding_to)

+    train_batch_size = args.batch_size
+    eval_batch_size = args.eval_batch_size
+    if use_torchacc():
+        train_batch_size *= world_size
+        eval_batch_size *= world_size
+        training_args.per_device_train_batch_size = train_batch_size
+        training_args.per_device_eval_batch_size = eval_batch_size
+        training_args.group_by_length = use_torchacc()
+
     # Trainer
     logger.info(f'training_args: {training_args}')

@@ -211,6 +241,9 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
         callbacks=callbacks,
         **trainer_kwargs)
     trainer.sft_args = args
+    if use_torchacc():
+        trainer.label_names = label_names
+        trainer.can_return_loss = return_loss
     if is_master():
         for args_obj, fname in zip([args, training_args],
                                    ['sft_args.json', 'training_args.json']):
@@ -233,7 +266,7 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
         f'best_model_checkpoint: {trainer.state.best_model_checkpoint}')
     train_time = get_time_info(trainer.state.log_history, len(train_dataset))
     # Visualization
-    if is_master():
+    if is_master() and not use_torchacc():
         images_dir = os.path.join(args.output_dir, 'images')
         logger.info(f'images_dir: {images_dir}')
         plot_images(images_dir, args.logging_dir, ['train/loss'], 0.9)
@@ -253,4 +286,14 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
     }


-sft_main = get_main(SftArguments, llm_sft)
+def get_sft_main(args, llm):
+    if use_torchacc():
+        logger.warning('TorchAcc is currently only available internally '
+                       'within Alibaba Cloud.')
+        import torchacc as ta
+        # This patch should be called before `llm_sft`.
+        ta.accelerate_hf_trainer()
+    return get_main(args, llm)
+
+
+sft_main = get_sft_main(SftArguments, llm_sft)
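
As a side note on the batch-size change above: under TorchAcc the per-device batch size handed to the Trainer becomes the global batch (per-rank batch size times world size), which the data collator (see the template.py change further down) then appears to split per rank via `pad_and_split_batch`. A quick arithmetic sketch, using illustrative values from the 4 * A800 LoRA script:

# Illustrative arithmetic only; values borrowed from the A800 LoRA script above.
batch_size = 6          # --batch_size (per rank)
world_size = 4          # NPROC_PER_NODE=4, single node
per_device_train_batch_size = batch_size * world_size
print(per_device_train_batch_size)  # 24, later split across ranks by the collator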

swift/llm/tuner.py

Lines changed: 14 additions & 1 deletion
@@ -1,18 +1,20 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os
 import types

 import torch
 import transformers
 from packaging import version

+from swift.torchacc_utils import consolidate_checkpoint
 from swift.trainers import TrainerCallback
 from swift.tuners import (AdaLoraConfig, IA3Config, LongLoRAConfig,
                           LongLoRAModelType, LoraConfig, LoRAConfig,
                           NEFTuneConfig, Swift)
 from swift.tuners.llamapro import LLaMAProConfig
 from swift.tuners.module_mapping import MODEL_KEYS_MAPPING
 from swift.utils import (activate_model_parameters, freeze_model_parameters,
-                         get_logger)
+                         get_logger, use_torchacc)
 from .utils import (SftArguments, find_all_linears, find_embedding, find_ln,
                     is_adapter)

@@ -149,6 +151,9 @@ def prepare_model(model, args: SftArguments):
             model = Swift.prepare_model(model, llamapro_config)
             logger.info(f'llamapro_config: {llamapro_config}')
         else:
+            if use_torchacc():
+                consolidate_checkpoint(args.resume_from_checkpoint,
+                                       'adapter_model')
             model = Swift.from_pretrained(
                 model, args.resume_from_checkpoint, is_trainable=True)
         # fix bug: Attempting to unscale FP16 gradients.
@@ -168,6 +173,14 @@ def prepare_model(model, args: SftArguments):
         if len(args.additional_trainable_parameters) > 0:
             activate_model_parameters(model,
                                       args.additional_trainable_parameters)
+        if use_torchacc() and args.resume_from_checkpoint is not None:
+            consolidate_checkpoint(args.resume_from_checkpoint, 'model')
+            weights_file = os.path.join(args.resume_from_checkpoint,
+                                        'model.bin')
+            state_dict = torch.load(weights_file, map_location='cpu')
+            model.load_state_dict(state_dict, False)
+            # release memory
+            del state_dict
     else:
         raise ValueError(f'args.sft_type: {args.sft_type}')

swift/llm/utils/argument.py

Lines changed: 6 additions & 0 deletions
@@ -49,6 +49,12 @@ class SftArguments:
         metadata={'help': f'model_type choices: {list(MODEL_MAPPING.keys())}'})
     model_id_or_path: Optional[str] = None
     model_revision: Optional[str] = None
+    model_layer_cls_name: Optional[str] = field(
+        default=None,
+        metadata={
+            'help':
+            "Decoder class name of the model, e.g. 'QWenBlock' for Qwen, 'LlamaDecoderLayer' for LLaMA"
+        })

     sft_type: Literal['lora', 'full', 'longlora', 'qalora', 'adalora', 'ia3',
                       'llamapro'] = 'lora'
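
The new `model_layer_cls_name` argument expects the class name of the model's decoder block. If that name is not known for a given checkpoint, a small hypothetical helper like the one below (not part of this commit; the model id is only an example) can list repeated module classes to inspect:

# Hypothetical helper, not part of this commit: list repeated module classes so
# the decoder block to pass as --model_layer_cls_name can be spotted
# (e.g. 'QWenBlock' for Qwen, 'LlamaDecoderLayer' for LLaMA).
from collections import Counter

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    'Qwen/Qwen-1_8B-Chat', trust_remote_code=True)
counts = Counter(type(m).__name__ for m in model.modules())
print([name for name, n in counts.most_common(10) if n > 1])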

swift/llm/utils/model.py

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@
 from transformers.utils.versions import require_version

 from swift import get_logger
-from swift.utils import is_dist, is_local_master
+from swift.utils import is_dist, is_local_master, use_torchacc
 from .template import TemplateType
 from .utils import get_max_model_len

@@ -2952,7 +2952,7 @@ def get_model_tokenizer(
     get_function = model_info['get_function']
     if model_kwargs is None:
         model_kwargs = {}
-    if 'device_map' not in model_kwargs:
+    if 'device_map' not in model_kwargs and not use_torchacc():
         model_kwargs['device_map'] = 'auto'

     if model_info.get('torch_dtype') is not None:

swift/llm/utils/template.py

Lines changed: 9 additions & 1 deletion
@@ -11,6 +11,8 @@
 from transformers import PreTrainedTokenizerBase, StoppingCriteria

 from swift.llm.agent.utils import calculate_loss_scale
+from swift.torchacc_utils import pad_and_split_batch
+from swift.utils import get_dist_setting, use_torchacc

 DEFAULT_SYSTEM = 'You are a helpful assistant.'
 History = List[Union[Tuple[str, str], List[str]]]
@@ -429,12 +431,18 @@ def data_collator(self,
                 loss_scale, batch_first=True, padding_value=0.)
         labels = pad_sequence(labels, batch_first=True, padding_value=-100)

+        if use_torchacc():
+            rank, _, world_size, _ = get_dist_setting()
+            input_ids, attention_mask, labels, loss_scale = pad_and_split_batch(
+                padding_to, input_ids, attention_mask, labels, loss_scale,
+                self.max_length, self.tokenizer, rank, world_size)
+
         res = {
             'input_ids': input_ids,
             'attention_mask': attention_mask,
             'labels': labels,
         }
-        if loss_scale is not None:
+        if loss_scale:
             res['loss_scale'] = loss_scale
         return res

swift/llm/utils/utils.py

Lines changed: 5 additions & 1 deletion
@@ -5,6 +5,7 @@
 import logging
 import os
 import shutil
+import sys
 from copy import deepcopy
 from functools import partial, wraps
 from queue import Empty, Queue
@@ -40,7 +41,8 @@
 from swift.hub import ModelScopeConfig
 from swift.tuners.module_mapping import MODEL_KEYS_MAPPING
 from swift.utils import (get_dist_setting, get_logger, is_ddp_plus_mp, is_dist,
-                         is_local_master, is_master, stat_array, upper_bound)
+                         is_local_master, is_master, stat_array, upper_bound,
+                         use_torchacc)
 from .template import History, StopWords, StopWordsCriteria, Template

 logger = get_logger()
@@ -868,6 +870,8 @@ def get_max_model_len(config: PretrainedConfig) -> Optional[int]:
             _old_ddp_init(self, model, *args, **kwargs))
     transformers.modeling_utils.get_balanced_memory = lambda *args, **kwargs: None
     transformers.modeling_utils.infer_auto_device_map = _infer_auto_device_map_patch
+
+if is_ddp_plus_mp() or use_torchacc():
     _old_accelerator_init = trainer.Accelerator.__init__
     trainer.Accelerator.__init__ = (
         lambda self, device_placement=False, *args, **kwargs:
