diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md
index 52b9baa44d..16064f04af 100644
--- a/docs/source/Instruction/Command-line-parameters.md
+++ b/docs/source/Instruction/Command-line-parameters.md
@@ -248,6 +248,7 @@ gradient_checkpointing: true
 - 🔥neftune_noise_alpha: neftune添加的噪声系数。默认为0，通常可以设置为5、10、15。
 - 🔥use_liger_kernel: 是否启用[Liger](https://github.com/linkedin/Liger-Kernel)内核加速训练并减少显存消耗。默认为False。示例shell参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/liger)。
   - 注意：liger_kernel不支持device_map，请使用DDP/DeepSpeed进行多卡训练。liger_kernel目前只支持`task_type='causal_lm'`。
+- use_cce: 是否启用[cut-cross-entropy](https://github.com/apple/ml-cross-entropy)融合算子降低显存并加速训练。默认为False。示例shell参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/cce)。
 - average_tokens_across_devices: 是否在设备之间进行token数平均。如果设置为True，将使用all_reduce同步`num_tokens_in_batch`以进行精确的损失计算。默认为False。
 - max_grad_norm: 梯度裁剪。默认为1.。
   - 注意：日志中的grad_norm记录的是裁剪前的值。
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index 52176fe193..44b571d959 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -250,6 +250,7 @@ Other important parameters:
 - 🔥neftune_noise_alpha: Noise magnitude for NEFTune. Default is 0. Common values: 5, 10, 15.
 - 🔥use_liger_kernel: Whether to enable the [Liger](https://github.com/linkedin/Liger-Kernel) kernel to accelerate training and reduce GPU memory consumption. Defaults to False. Example shell script can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/liger).
   - Note: Liger kernel does not support `device_map`. Use DDP or DeepSpeed for multi-GPU training. Currently, liger_kernel only supports `task_type='causal_lm'`.
+- use_cce: Whether to enable the [cut-cross-entropy](https://github.com/apple/ml-cross-entropy) fused operator to reduce GPU memory usage and accelerate training. Defaults to `False`. Example shell script can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/cce).
 - average_tokens_across_devices: Whether to average token counts across devices. If `True`, `num_tokens_in_batch` is synchronized via `all_reduce` for accurate loss computation. Default is `False`.
 - max_grad_norm: Gradient clipping. Default is 1.
   - Note: The logged `grad_norm` reflects the value **before** clipping.
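As background for the memory numbers quoted in the example below (this is an editor's illustration, not part of the patch): a causal LM conventionally materializes a `[tokens, vocab_size]` logits tensor just to compute the loss, and at modern vocabulary sizes that tensor dwarfs the rest of the activation memory. A minimal PyTorch sketch with illustrative sizes:

```python
# Sketch of the naive loss path that cut-cross-entropy avoids (illustrative
# sizes, not ms-swift code). The full logits matrix exists only to be
# reduced to a scalar loss.
import torch
import torch.nn.functional as F

tokens, hidden, vocab = 2048, 896, 151_936    # Qwen2.5-0.5B-like dimensions
hidden_states = torch.randn(tokens, hidden)
lm_head = torch.randn(vocab, hidden)
labels = torch.randint(0, vocab, (tokens,))

logits = hidden_states @ lm_head.T            # [2048, 151936] ~ 1.2 GB in fp32
loss = F.cross_entropy(logits, labels)        # immediately collapsed to a scalar

# cut-cross-entropy fuses the lm_head projection with the log-sum-exp
# reduction, computing the same loss blockwise so `logits` is never
# fully materialized.
```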
diff --git a/examples/train/cce/sft.sh b/examples/train/cce/sft.sh
new file mode 100644
index 0000000000..5c34b74db3
--- /dev/null
+++ b/examples/train/cce/sft.sh
@@ -0,0 +1,17 @@
+# test env: 1 * A10
+# Using use_cce: 2.62GB
+# Not using use_cce: 16.24GB
+
+# Install CCE dependency
+pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f643b88"
+
+# Run ms-swift (example)
+swift sft \
+    --model Qwen/Qwen2.5-0.5B-Instruct \
+    --dataset gsm8k#1024 \
+    --train_type lora \
+    --per_device_train_batch_size 64 \
+    --per_device_eval_batch_size 64 \
+    --use_hf true \
+    --use_cce true \
+    "$@"
diff --git a/swift/llm/train/tuner.py b/swift/llm/train/tuner.py
index 286a9f4b04..e18c8af3aa 100644
--- a/swift/llm/train/tuner.py
+++ b/swift/llm/train/tuner.py
@@ -86,6 +86,76 @@ def apply_liger(model_type: str):
                           'by running `pip install -U liger-kernel`')
 
 
+def apply_cce(model_type: str):
+    try:
+        from cut_cross_entropy.transformers import cce_patch
+    except ImportError:
+        raise ImportError('Please install cut-cross-entropy to apply cce kernels to this model '
+                          'by running `pip install "cut-cross-entropy[transformers] @ '
+                          'git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f643b88"`')
+    from swift.llm import ModelType
+
+    model_type_map = {
+        # llama family
+        ModelType.llama: 'llama',
+        ModelType.llama3: 'llama',
+        ModelType.llama3_1: 'llama',
+        ModelType.llama3_2: 'llama',
+        ModelType.llama4: 'llama4',
+        ModelType.llama3_2_vision: 'mllama',
+        # mistral & mixtral family
+        ModelType.mistral: 'mistral',
+        ModelType.mixtral: 'mixtral',
+        # phi
+        ModelType.phi3: 'phi3',
+        # gemma family
+        ModelType.gemma: 'gemma',
+        ModelType.gemma2: 'gemma2',
+        ModelType.gemma3_text: 'gemma3_text',
+        ModelType.gemma3_vision: 'gemma3',
+        ModelType.gemma3n: 'gemma3n',
+        # glm4 family
+        ModelType.glm4: 'glm4',
+        ModelType.glm4_0414: 'glm4',
+        ModelType.glm4_5: 'glm4_moe',
+        ModelType.glm4_z1_rumination: 'glm4_moe',
+        ModelType.glm4v: 'glm4v',
+        ModelType.glm4_1v: 'glm4v',
+        ModelType.glm4_5v: 'glm4v_moe',
+        # llava
+        ModelType.llava1_5_hf: 'llava',
+        ModelType.llava_llama3_hf: 'llava',
+        # qwen2 family
+        ModelType.qwen2: 'qwen2',
+        ModelType.qwen2_5: 'qwen2',
+        ModelType.qwen2_vl: 'qwen2_vl',
+        ModelType.qwen2_5_vl: 'qwen2_5_vl',
+        # qwen3 family
+        ModelType.qwen3: 'qwen3',
+        ModelType.qwen3_guard: 'qwen3',
+        ModelType.qwen3_thinking: 'qwen3',
+        ModelType.qwen3_nothinking: 'qwen3',
+        ModelType.qwen3_coder: 'qwen3',
+        ModelType.qwen3_moe: 'qwen3_moe',
+        ModelType.qwen3_moe_thinking: 'qwen3_moe',
+        ModelType.qwen3_next: 'qwen3_next',
+        ModelType.qwen3_next_thinking: 'qwen3_next',
+        ModelType.qwen3_vl: 'qwen3_vl',
+        ModelType.qwen3_moe_vl: 'qwen3_vl_moe',
+        # deepseek family
+        ModelType.deepseek_r1: 'deepseek_v3',
+        ModelType.deepseek_v3_1: 'deepseek_v3',
+    }
+
+    cce_model_type = model_type_map.get(model_type)
+    if cce_model_type:
+        cce_patch(cce_model_type)
+        return
+
+    supported_models = ', '.join(sorted(set(model_type_map.values())))
+    raise ValueError(f'Unsupported cce model_type: {model_type}. '
+                     f'Supported types: {supported_models}')
+
+
 def get_multimodal_target_regex(
     model,
     *,
@@ -375,6 +445,9 @@ def prepare_model(cls, args, model, *, template=None, train_dataset=None, task_t
             # Apply liger
             apply_liger(args.model_type)
 
+        if args.use_cce and 'use_cce' not in inspect.signature(TrainingArguments).parameters:
+            apply_cce(args.model_type)
+
         if args.is_adapter:
             if args.tuner_backend != 'unsloth' and args.train_type not in extra_tuners:
                 # Fix the name of the layer in xcomposer that contains Plora.
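For orientation (an editor's hedged sketch based on the cut-cross-entropy transformers integration that `apply_cce` calls into, not code this patch adds): once `model_type_map` has translated ms-swift's `model_type` name, the patch boils down to a single `cce_patch` call, which can also be used standalone:

```python
# Hedged standalone equivalent of apply_cce('qwen2_5'); assumes the
# cut-cross-entropy[transformers] extra is installed (see examples/train/cce).
from cut_cross_entropy.transformers import cce_patch
from transformers import AutoModelForCausalLM

cce_patch('qwen2')  # the string that model_type_map resolves 'qwen2_5' to
model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
# Forward passes that pass `labels` should now compute the loss with the
# fused CCE kernel instead of materializing the full logits tensor.
```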
diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py
index f356439bd8..a8f69072c0 100644
--- a/swift/trainers/arguments.py
+++ b/swift/trainers/arguments.py
@@ -9,7 +9,7 @@
 from transformers.training_args_seq2seq import Seq2SeqTrainingArguments as HfSeq2SeqTrainingArguments
 
 from swift.plugin import loss_mapping
-from swift.utils import get_dist_setting, get_logger, is_liger_available, is_mp, json_parse_to_dict
+from swift.utils import get_dist_setting, get_logger, is_cce_available, is_liger_available, is_mp, json_parse_to_dict
 from .optimizers.galore import GaLoreConfig
 
 logger = get_logger()
@@ -53,6 +53,7 @@ class TrainArgumentsMixin:
         dataloader_prefetch_factor (Optional[int]): The number of batches loaded in advance by each worker.
             Defaults to None.
         use_liger_kernel (bool): Whether to use the Liger kernel for optimization. Defaults to False.
+        use_cce (bool): Whether to use ml-cross-entropy fused kernels for optimization. Defaults to False.
         check_model (bool): If True, checks local model files for corruption or modification and provides a warning.
             Should be set to False in an offline environment. Defaults to True.
         acc_strategy (Literal['token', 'seq']): The strategy for calculating accuracy during training and validation.
@@ -115,6 +116,7 @@ class TrainArgumentsMixin:
     dataloader_persistent_workers: bool = False
     dataloader_prefetch_factor: Optional[int] = None
     use_liger_kernel: bool = False
+    use_cce: bool = False
 
     # extra
     check_model: bool = True
@@ -163,11 +165,20 @@ def _init_liger(self):
             except Exception:
                 pass
 
+    def _init_cce(self):
+        if self.use_cce:
+            assert is_cce_available(), ('use_cce requires cut-cross-entropy, try '
+                                        '`pip install "cut-cross-entropy[transformers] @ '
+                                        'git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f643b88"`')
+
     def __post_init__(self):
         if is_mp() and self.use_liger_kernel:
             raise ValueError('liger_kernel does not support device_map. '
                              'Please use DDP/DeepSpeed for multi-GPU training.')
+        if self.use_cce and self.use_liger_kernel:
+            logger.warning('Enabling both use_cce and use_liger_kernel may lead to duplicated kernel patches.')
+
         if self.optimizer is None and (self.vit_lr is not None or self.aligner_lr is not None):
             self.optimizer = 'multimodal'
         if self.gradient_accumulation_steps is None:
@@ -181,6 +192,7 @@ def __post_init__(self):
         if self.gradient_checkpointing_kwargs:
             self.gradient_checkpointing_kwargs = json_parse_to_dict(self.gradient_checkpointing_kwargs)
         self._init_liger()
+        self._init_cce()
         if self.dataloader_num_workers is None:
             if platform.system() == 'Windows':
                 self.dataloader_num_workers = 0
diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py
index 74cd9ad481..b4d346171e 100644
--- a/swift/trainers/trainers.py
+++ b/swift/trainers/trainers.py
@@ -318,9 +318,9 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N
         if (self.label_smoother is not None or compute_loss_func is not None or loss_scale is not None
                 or self.args.enable_dft_loss or self.args.enable_channel_loss
                 or self.template.sequence_parallel_size > 1) and 'labels' in inputs:
-            if self.args.use_liger_kernel:
-                logger.warning_once('The cross_entropy loss function defined in Liger Kernel will not '
-                                    'take effect, potentially leading to increased GPU memory consumption.')
+            if self.args.use_liger_kernel or getattr(self.args, 'use_cce', False):
+                logger.warning_once('The cross_entropy loss function defined in Liger Kernel or ml-cross-entropy will '
+                                    'not take effect, potentially leading to increased GPU memory consumption.')
             labels = inputs.pop('labels')
             outputs = model(**inputs)
             if getattr(outputs, 'aux_loss', None) is not None:
diff --git a/swift/ui/llm_grpo/llm_grpo.py b/swift/ui/llm_grpo/llm_grpo.py
index 0cd13462fe..ff6256e4ee 100644
--- a/swift/ui/llm_grpo/llm_grpo.py
+++ b/swift/ui/llm_grpo/llm_grpo.py
@@ -157,6 +157,16 @@ class LLMGRPO(LLMTrain):
                 'en': 'Liger kernel can reduce memory usage'
             }
         },
+        'use_cce': {
+            'label': {
+                'zh': '使用CCE加速',
+                'en': 'Use CCE acceleration'
+            },
+            'info': {
+                'zh': 'CCE (ml-cross-entropy) 提供融合的交叉熵算子',
+                'en': 'CCE (ml-cross-entropy) provides fused cross-entropy kernels'
+            }
+        },
         'sequence_parallel_size': {
             'label': {
                 'zh': '序列并行大小',
@@ -233,6 +243,7 @@ def do_build_ui(cls, base_tab: Type['BaseUI']):
                     gr.Textbox(elem_id='seed', scale=4)
                     gr.Dropdown(elem_id='torch_dtype', scale=4)
                     gr.Checkbox(elem_id='use_liger_kernel', scale=4)
+                    gr.Checkbox(elem_id='use_cce', scale=4)
                     gr.Textbox(elem_id='sequence_parallel_size', lines=1, scale=4)
                 with gr.Row():
                     gr.Dropdown(
diff --git a/swift/ui/llm_rlhf/llm_rlhf.py b/swift/ui/llm_rlhf/llm_rlhf.py
index d7f1f740c7..1b8c028fce 100644
--- a/swift/ui/llm_rlhf/llm_rlhf.py
+++ b/swift/ui/llm_rlhf/llm_rlhf.py
@@ -170,6 +170,16 @@ class LLMRLHF(LLMTrain):
                 'en': 'Liger kernel can reduce memory usage'
             }
         },
+        'use_cce': {
+            'label': {
+                'zh': '使用CCE加速',
+                'en': 'Use CCE acceleration'
+            },
+            'info': {
+                'zh': 'CCE (ml-cross-entropy) 提供融合的交叉熵算子',
+                'en': 'CCE (ml-cross-entropy) provides fused cross-entropy kernels'
+            }
+        },
         'sequence_parallel_size': {
             'label': {
                 'zh': '序列并行大小',
@@ -246,6 +256,7 @@ def do_build_ui(cls, base_tab: Type['BaseUI']):
                     gr.Textbox(elem_id='seed', scale=2)
                     gr.Dropdown(elem_id='torch_dtype', scale=2)
                     gr.Checkbox(elem_id='use_liger_kernel', scale=2)
+                    gr.Checkbox(elem_id='use_cce', scale=2)
                 with gr.Row():
                     gr.Dropdown(
                         elem_id='gpu_id',
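One caveat worth spelling out for the `compute_loss` change in trainers.py above (an editor's illustrative sketch, not patch code): both Liger and CCE patch the loss computed inside `model.forward`, so any trainer path that pops `labels` and computes the loss from the returned logits silently bypasses the fused kernel. That is exactly the situation the `warning_once` reports:

```python
# Illustrative sketch of the bypass the warning refers to: with `labels`
# popped before forward, the model returns raw logits and the patched
# fused loss never runs.
import torch.nn.functional as F

def loss_outside_forward(model, inputs):
    labels = inputs.pop('labels')    # fused loss needs labels inside forward
    outputs = model(**inputs)        # model computes logits only
    logits = outputs.logits          # full [tokens, vocab] tensor materialized
    return F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
```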
diff --git a/swift/ui/llm_train/llm_train.py b/swift/ui/llm_train/llm_train.py
index b1b2b94f6d..a6c1c7dcce 100644
--- a/swift/ui/llm_train/llm_train.py
+++ b/swift/ui/llm_train/llm_train.py
@@ -177,6 +177,16 @@ class LLMTrain(BaseUI):
                 'en': 'Liger kernel can reduce memory usage'
             }
         },
+        'use_cce': {
+            'label': {
+                'zh': '使用CCE加速',
+                'en': 'Use CCE acceleration'
+            },
+            'info': {
+                'zh': 'CCE (ml-cross-entropy) 提供融合的交叉熵算子',
+                'en': 'CCE (ml-cross-entropy) provides fused cross-entropy kernels'
+            }
+        },
         'sequence_parallel_size': {
             'label': {
                 'zh': '序列并行大小',
@@ -257,6 +267,7 @@ def do_build_ui(cls, base_tab: Type['BaseUI']):
                     gr.Textbox(elem_id='seed', scale=4)
                     gr.Dropdown(elem_id='torch_dtype', scale=4)
                     gr.Checkbox(elem_id='use_liger_kernel', scale=4)
+                    gr.Checkbox(elem_id='use_cce', scale=4)
                 with gr.Row():
                     gr.Dropdown(
                         elem_id='gpu_id',
@@ -390,6 +401,9 @@ def train(cls, *args):
         use_liger_kernel = kwargs.get('use_liger_kernel', None)
         if use_liger_kernel:
             kwargs.pop('use_liger_kernel')
+        use_cce = kwargs.get('use_cce', None)
+        if use_cce:
+            kwargs.pop('use_cce')
         if other_kwargs.get('use_muon'):
             kwargs['use_muon'] = other_kwargs.pop('use_muon')
 
@@ -428,6 +442,9 @@ def train(cls, *args):
         if use_liger_kernel:
             params += f'--use_liger_kernel {cls.quote}{use_liger_kernel}{cls.quote} '
             command.extend(['--use_liger_kernel', f'{use_liger_kernel}'])
+        if use_cce:
+            params += f'--use_cce {cls.quote}{use_cce}{cls.quote} '
+            command.extend(['--use_cce', f'{use_cce}'])
         if use_muon:
             params += f'--optimizer {cls.quote}muon{cls.quote} '
             command.extend(['--optimizer', 'muon'])
diff --git a/swift/utils/__init__.py b/swift/utils/__init__.py
index 4d9216bb70..e2ff7b4ab4 100644
--- a/swift/utils/__init__.py
+++ b/swift/utils/__init__.py
@@ -2,7 +2,7 @@
 from .env import (get_dist_setting, get_hf_endpoint, get_node_setting, get_pai_tensorboard_dir, is_deepspeed_enabled,
                   is_dist, is_last_rank, is_local_master, is_master, is_mp, is_mp_ddp, is_pai_training_job,
                   use_hf_hub)
-from .import_utils import (is_flash_attn_2_available, is_flash_attn_3_available, is_liger_available,
+from .import_utils import (is_cce_available, is_flash_attn_2_available, is_flash_attn_3_available, is_liger_available,
                            is_lmdeploy_available, is_megatron_available, is_swanlab_available, is_trl_available,
                            is_unsloth_available, is_vllm_ascend_available, is_vllm_available, is_wandb_available)
 from .io_utils import JsonlWriter, append_to_jsonl, download_ms_file, get_file_mm_type, read_from_jsonl, write_to_jsonl
diff --git a/swift/utils/import_utils.py b/swift/utils/import_utils.py
index 95e9ba1f47..fc46160877 100644
--- a/swift/utils/import_utils.py
+++ b/swift/utils/import_utils.py
@@ -28,6 +28,10 @@ def is_liger_available():
     return importlib.util.find_spec('liger_kernel') is not None
 
 
+def is_cce_available():
+    return importlib.util.find_spec('cut_cross_entropy') is not None
+
+
 def is_swanlab_available():
     return importlib.util.find_spec('swanlab') is not None
diff --git a/tests/train/test_cce.py b/tests/train/test_cce.py
new file mode 100644
index 0000000000..728aa2f76c
--- /dev/null
+++ b/tests/train/test_cce.py
@@ -0,0 +1,29 @@
+import os
+
+# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
+
+kwargs = {
+    'per_device_train_batch_size': 64,
+    'save_steps': 30,
+    'gradient_accumulation_steps': 2,
+    'num_train_epochs': 1,
+}
+
+
+def test_sft():
+    from swift.llm import sft_main, TrainArguments, infer_main, InferArguments
+    result = sft_main(
+        TrainArguments(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            dataset=['gsm8k#1024'],
+            split_dataset_ratio=0.01,
+            use_cce=True,
+            # use_liger_kernel=True,
+            **kwargs))
+    last_model_checkpoint = result['last_model_checkpoint']
+    infer_main(InferArguments(adapters=last_model_checkpoint, load_data_args=True))
+
+
+if __name__ == '__main__':
+    test_sft()
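Beyond the end-to-end test above, a lighter-weight sanity check is possible (an editor's hedged sketch against cut-cross-entropy's core `linear_cross_entropy` API; it assumes a CUDA device and the pinned install from `examples/train/cce/sft.sh`):

```python
# Hedged numerical sanity check: the fused loss should match a naive
# logits-based cross-entropy within half-precision tolerance.
import torch
import torch.nn.functional as F
from cut_cross_entropy import linear_cross_entropy

embeddings = torch.randn(512, 256, device='cuda', dtype=torch.bfloat16)
classifier = torch.randn(1024, 256, device='cuda', dtype=torch.bfloat16)
targets = torch.randint(0, 1024, (512,), device='cuda')

fused = linear_cross_entropy(embeddings, classifier, targets)  # no logits tensor
naive = F.cross_entropy((embeddings @ classifier.T).float(), targets)
torch.testing.assert_close(fused.float(), naive, atol=1e-2, rtol=1e-2)
```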