3333 FSDP2_SUPPORTED = False
3434
3535try :
36+ import torch .distributed .checkpoint as dcp
3637 from torch .distributed .checkpoint .state_dict import (
3738 StateDictOptions ,
39+ get_model_state_dict ,
3840 set_model_state_dict ,
3941 )
4042
@@ -163,8 +165,29 @@ def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]):
163165 )
164166 fsdp_mode = gpc .config .parallel .fsdp .get ("mode" , "v1" )
165167 fsdp_init_method = gpc .config .parallel .fsdp .get ("init_method" , "cuda" )
168+ if gpc .is_using_parallel_mode (ParallelMode .EXPERT ):
169+ assert gpc .get_world_size (ParallelMode .EXPERT_DATA ) * gpc .get_world_size (ParallelMode .EXPERT ) == gpc .get_world_size (ParallelMode .GLOBAL )
166170
167171 if fsdp_mode == "v1" :
172+ ignored_mod = []
173+ if gpc .is_using_parallel_mode (ParallelMode .EXPERT ):
174+ for layer_id , layer in enumerate (model .model .layers ):
175+ if layer_id >= gpc .config .model .first_k_dense_replace :
176+ # Should follow this modeling pattern if EP is enabled.
177+ # Change the expert module name if needed.
178+ # TODO: Make this part hard-coded or config-driven?
179+ layer .feed_forward .moe_layer .experts = FSDP (
180+ layer .feed_forward .moe_layer .experts ,
181+ process_group = gpc .get_group (ParallelMode .EXPERT_DATA ),
182+ sharding_strategy = ShardingStrategy .FULL_SHARD ,
183+ sync_module_states = fsdp_init_method != "cuda" , # sync model parameters
184+ forward_prefetch = True ,
185+ backward_prefetch = BackwardPrefetch .BACKWARD_PRE ,
186+ limit_all_gathers = True ,
187+ use_orig_params = True ,
188+ device_id = None if fsdp_init_method == "cuda" else get_current_device (), # needed for sync_module_states
189+ )
190+ ignored_mod .append (layer .feed_forward .moe_layer .experts )
168191 model = FSDP (
169192 module = model ,
170193 process_group = gpc .get_group (ParallelMode .GLOBAL ),
@@ -176,6 +199,7 @@ def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]):
176199 limit_all_gathers = True ,
177200 use_orig_params = True ,
178201 device_id = None if fsdp_init_method == "cuda" else get_current_device (), # needed for sync_module_states
202+ ignored_modules = ignored_mod ,
179203 )
180204 # For FSDP v1, to get ckpt resuming work normally, we do dummy forward.
181205 # This hack is needed due to FSDP v1 lazy initialization in model construction.
@@ -196,7 +220,7 @@ def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]):
196220 else :
197221 raise ValueError (f"Unsupported FSDP mode: { fsdp_mode } " )
198222
199- if is_using_hf () and not gpc .config .ckpt .get ("auto_resume" , False ):
223+ if not gpc .config .ckpt .get ("auto_resume" , False ):
200224 load_ckpt_info = gpc .config .ckpt .load_ckpt_info
201225 load_ckpt_path = load_ckpt_info .get ("path" , None )
202226 load_ckpt_content = load_ckpt_info .get ("content" , [])
@@ -205,16 +229,22 @@ def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]):
205229 "model" ,
206230 ), "If auto_resume=False and checkpoint path is given, only model can be loaded"
207231 if DCP_SUPPORTED :
208- hf = gpc .config .hf
209- mod = LazyObject (hf .mod , hf .mod_cls )
210- mod = mod .build ()
211- state_dict = mod .from_pretrained (
212- pretrained_model_name_or_path = load_ckpt_path , use_safetensors = True
213- ).state_dict ()
214- state_dict = {f"model.{ key } " : state_dict [key ].clone ().detach () for key in state_dict }
215- set_model_state_dict (
216- model = model , model_state_dict = state_dict , options = StateDictOptions (full_state_dict = True )
217- )
232+ if is_using_hf ():
233+ hf = gpc .config .hf
234+ mod = LazyObject (hf .mod , hf .mod_cls )
235+ mod = mod .build ()
236+ state_dict = mod .from_pretrained (
237+ pretrained_model_name_or_path = load_ckpt_path , use_safetensors = True
238+ ).state_dict ()
239+ state_dict = {f"model.{ key } " : state_dict [key ].clone ().detach () for key in state_dict }
240+ set_model_state_dict (
241+ model = model , model_state_dict = state_dict , options = StateDictOptions (full_state_dict = True )
242+ )
243+ else :
244+ state_dict = get_model_state_dict (model = model )
245+ state_dict = {key : state_dict [key ].clone ().detach () for key in state_dict }
246+ dcp .load (state_dict = state_dict , checkpoint_id = load_ckpt_path )
247+ set_model_state_dict (model = model , model_state_dict = state_dict )
218248 del state_dict
219249 internlm_accelerator .empty_cache ()
220250 else :
0 commit comments