Commit 3737315

best speed (#11023)
1 parent 5b4855d commit 3737315

7 files changed: +77 -55 lines changed

llm/config/deepseek-v3/pretrain_argument.json

Lines changed: 8 additions & 7 deletions
@@ -3,16 +3,17 @@
     "tokenizer_name_or_path": "deepseek-ai/DeepSeek-V3",
     "input_dir": "./data",
     "output_dir": "./checkpoints/pretrain_ckpts",
+    "resume_from_huggingface_ckpt": "./huggingface_ckpt/",
     "per_device_train_batch_size": 1,
     "gradient_accumulation_steps": 60,
     "per_device_eval_batch_size": 1,
     "tensor_parallel_degree": 1,
     "pipeline_parallel_degree": 8,
-    "pipeline_parallel_config": "use_dualpipev",
-    "sharding_parallel_degree": 64,
-    "sharding_parallel_config": "split_param enable_fuse_optimizer_states",
-    "sharding_comm_buffer_size_MB": 4096,
-    "expert_parallel_degree": 64,
+    "pipeline_parallel_config": "use_dualpipev enable_overlap_p2p_comm",
+    "sharding_parallel_degree": 32,
+    "sharding_parallel_config": "split_param",
+    "sharding_comm_buffer_size_MB": 2048,
+    "expert_parallel_degree": 32,
     "sharding": "stage1",
     "virtual_pp_degree": 1,
     "sequence_parallel": 0,
@@ -47,7 +48,7 @@
     "use_fused_rope": true,
     "save_sharded_model": false,
     "load_sharded_model": false,
-    "unified_checkpoint": true,
     "use_expert_parallel": true,
-    "unified_checkpoint_config": "skip_save_model_weight"
+    "unified_checkpoint_config": "skip_save_model_weight",
+    "offload_optim": true
 }
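
The new settings halve the sharding and expert-parallel widths (64 to 32), keep the 8-stage dualpipev pipeline while enabling overlapped p2p communication, and swap the fused-optimizer-state sharding option for optimizer offload. A quick sanity check of the implied GPU count (a minimal sketch; the keys are the JSON fields above, and the assumption that world size = tp * pp * sharding under stage1 is mine, not something the commit states):

    import json

    # Recompute the implied world size from the config above (illustrative only).
    with open("llm/config/deepseek-v3/pretrain_argument.json") as f:
        cfg = json.load(f)

    world_size = (
        cfg["tensor_parallel_degree"]      # 1
        * cfg["pipeline_parallel_degree"]  # 8
        * cfg["sharding_parallel_degree"]  # 32
    )
    print(world_size)  # 256 under these assumptions (the old degrees implied 512)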

llm/model_config/DeepSeek-V3/config.json

Lines changed: 8 additions & 3 deletions
@@ -59,15 +59,20 @@
     "v_head_dim": 128,
     "vocab_size": 129280,
     "using_flex_token": true,
-    "using_fake_gate": true,
+    "using_fake_gate": false,
     "use_fused_rms_norm": true,
     "fuse_attention_ffn": true,
     "use_fused_rope": true,
     "token_drop_steps": 0,
     "recompute_fwd_gate_up": true,
-    "adaptive_remained_O1_recompute_ratio": 2.0,
+    "adaptive_remained_O1_recompute_ratio": 0,
     "using_post_norm_recompute": true,
     "is_split_group_gemm": false,
     "use_dualpipev": true,
-    "send_mtp_embed": false
+    "send_mtp_embed": false,
+    "mlp_fwd_subbatch_rows": 0,
+    "mlp_bwd_subbatch_rows": 65536,
+    "output_subbatch_rows": 2048,
+    "recompute_fa3": true,
+    "stepped_recompute_fwd_gate_up": true
 }
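
The new *_subbatch_rows keys cap activation memory by running the big MLP and output-projection GEMMs over row chunks; 0 disables chunking, so here only the MLP backward (65536 rows) and the output projection (2048 rows) are chunked. A minimal illustration of the row-subbatching idea, not the actual PaddleNLP kernels, and the helper name is invented:

    import paddle
    import paddle.nn.functional as F

    def mlp_in_row_subbatches(x, w1, w2, subbatch_rows):
        """Run a 2-layer MLP over row chunks so the large intermediate
        activation exists for only one chunk at a time (illustrative only)."""
        if subbatch_rows <= 0 or x.shape[0] <= subbatch_rows:  # 0 means "no subbatching"
            return paddle.matmul(F.silu(paddle.matmul(x, w1)), w2)
        outs = []
        for start in range(0, x.shape[0], subbatch_rows):
            chunk = x[start : start + subbatch_rows]
            outs.append(paddle.matmul(F.silu(paddle.matmul(chunk, w1)), w2))
        return paddle.concat(outs, axis=0)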

llm/script/train_gpu.sh

Lines changed: 10 additions & 1 deletion
@@ -47,13 +47,22 @@ fi
 export PYTHONPATH=../:$PYTHONPATH
 export CUDA_PATH=/usr/local/cuda-12.9
 
+# Flags for best performance
 export DSV3_USE_FP8_GEMM=true
 export DSV3_USE_ATTEN_RECOMPUTE=true
 export FA_VERSION=3
 export FLAGS_share_tensor_for_grad_tensor_holder=1
 export FLAGS_use_default_stream=false
 export DSV3_USE_FP8_DISPATCH=true
-export USE_DS_GEMM=false
+export USE_DS_GEMM=true
+
+# Flags for allocator
+export FLAGS_large_pool_auto_growth_chunk_size_in_mb=500
+export FLAGS_small_pool_auto_growth_chunk_size_in_mb=20
+export FLAGS_small_pool_size_in_mb=10
+export FLAGS_samll_pool_pre_alloc_in_mb=500
+export FLAGS_large_pool_pre_alloc_in_mb=61440
+export FLAGS_deep_ep_comm_prealloc_in_mb=1000
 
 
 bash script/kill_process.sh
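
These variables are read when Paddle initializes, so they have to be in the environment before the training process starts. If a job is launched from Python rather than through this script, the same flags can presumably be set via os.environ before importing paddle (a sketch under that assumption; the values are copied from the hunk above):

    import os

    # Mirror a few of the flags from llm/script/train_gpu.sh; set them before
    # `import paddle` so the allocator and FP8 paths see them at startup.
    os.environ.update({
        "DSV3_USE_FP8_GEMM": "true",
        "FA_VERSION": "3",
        "FLAGS_large_pool_pre_alloc_in_mb": "61440",  # pre-allocate ~60 GiB per GPU
        "FLAGS_deep_ep_comm_prealloc_in_mb": "1000",
    })

    import paddle  # noqa: E402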

paddlenlp/trainer/utils/load_hf_ckpt.py

Lines changed: 1 addition & 1 deletion
@@ -207,7 +207,7 @@ def _get_hf_prefix(segment_id: int, id_in_segment: int) -> str:
     # special_cases = {(0, 0): "model", (28, 2): "model.layers.61", (28, 3): "model"}
     # special_cases = {(0, 0): "model", (28, 2): "model.layers.61", (4, 1): "model"}
     # special_cases = {(0, 0): "model", (28, 2): "model", (28,3): "lm_head"}
-    special_cases = {(0, 0): "model", (60, 2): "model", (60, 3): "lm_head"}
+    special_cases = {(0, 0): "model", (60, 2): "model.layers.61", (60, 3): "model", (60, 4): "lm_head"}
 
     if (segment_id, id_in_segment) in special_cases:
         return special_cases[(segment_id, id_in_segment)]
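
The special-case table for the last pipeline segment (60) now maps one extra sublayer: index 2 resolves to the MTP block (model.layers.61), index 3 to the base model, and index 4 to the LM head. A small illustrative lookup, assuming _get_hf_prefix is importable from this module:

    from paddlenlp.trainer.utils.load_hf_ckpt import _get_hf_prefix

    print(_get_hf_prefix(60, 2))  # "model.layers.61", the MTP weights in the HF checkpoint
    print(_get_hf_prefix(60, 4))  # "lm_head"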

paddlenlp/transformers/deepseek_v2/modeling.py

Lines changed: 25 additions & 24 deletions
@@ -85,6 +85,7 @@
 from ..fp8_utils import (
     FP8KeepXLinear,
     FP8Linear,
+    FP8LinearFunction,
     FP8LinearFunctionBase,
     FP8Mlp,
     cache_fp8_weight,
@@ -1552,7 +1553,10 @@ def backward(ctx, dout):
         else:
             assert False, f"invalid {FA_VERSION=}"
 
-        if (FA_VERSION == 3 and not recompute_fa3) or FA_VERSION == 2:
+        if FA_VERSION == 2:
+            assert not recompute_fa3
+            assert attn_out is not None and softmax_lse is not None
+        if FA_VERSION == 3 and not recompute_fa3:
             assert attn_out is not None and softmax_lse is not None
 
         q_ln_t, q_ln_invar = fused_ln.fused_rms_norm(q_init, q_ln_weight, eps)
@@ -1636,25 +1640,25 @@ def backward(ctx, dout):
         elif FA_VERSION == 3:
             # recompute fa3
             if recompute_fa3:
-                logger.info("Enable fa3 recomputation")
-                attn_out, softmax_lse = _C_ops.flash_attn_v3(
-                    query_states,
-                    key_states,
-                    value_states,
-                    None,  # q_v_
-                    None,  # q_descale_
-                    None,  # k_descale_
-                    None,  # v_descale_
-                    softmax_scale,
-                    True,
-                    -1,  # window_size_left
-                    -1,  # window_size_right
-                    0.0,  # softcap
-                    1,  # num_splits
-                    False,  # manual_set_pack_gqa
-                    False,  # pack_gqa_
-                    0,  # sm_margin
-                )
+                with paddle.no_grad():
+                    attn_out, softmax_lse = _C_ops.flash_attn_v3(
+                        query_states,
+                        key_states,
+                        value_states,
+                        None,  # q_v_
+                        None,  # q_descale_
+                        None,  # k_descale_
+                        None,  # v_descale_
+                        softmax_scale,
+                        True,
+                        -1,  # window_size_left
+                        -1,  # window_size_right
+                        0.0,  # softcap
+                        1,  # num_splits
+                        False,  # manual_set_pack_gqa
+                        False,  # pack_gqa_
+                        0,  # sm_margin
+                    )
             with paddle.no_grad():
                 q_grad, k_grad, v_grad = _C_ops.flash_attn_v3_grad(
                     query_states,
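
With recompute_fa3 enabled, the PyLayer's forward no longer keeps attn_out and softmax_lse, so the backward has to re-run flash_attn_v3 before calling flash_attn_v3_grad; the change wraps that re-run in paddle.no_grad() so the recomputation is not itself traced, and drops the per-step logger.info call. The pattern in miniature (a sketch, not the attention code itself):

    import paddle

    class RecomputedExp(paddle.autograd.PyLayer):
        """Recompute-in-backward: save only the cheap input, rebuild the
        activation under no_grad when the gradient needs it."""

        @staticmethod
        def forward(ctx, x):
            ctx.save_for_backward(x)   # deliberately do not keep y = exp(x)
            return paddle.exp(x)

        @staticmethod
        def backward(ctx, dout):
            (x,) = ctx.saved_tensor()
            with paddle.no_grad():     # recomputation must not build a new graph
                y = paddle.exp(x)      # plays the role of the flash_attn_v3 re-run above
            return dout * y
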
@@ -2587,7 +2591,7 @@ def forward(
         nextn_hidden_state = self.enorm(nextn_hidden_state)
 
         concat_h = paddle.concat([nextn_hidden_state, hidden_states], axis=-1)
-        hidden_states = LMHeadFunction.apply(concat_h, self.eh_proj.weight, False)
+        hidden_states = FP8LinearFunction.apply(concat_h, self.eh_proj)
 
         layer_outputs = super(DeepseekV2MTPLayer, self).forward(
             hidden_states,
@@ -3180,11 +3184,8 @@ def forward(
 class FastCrossEntropyFunction(paddle.autograd.PyLayer):
     @staticmethod
     def forward(ctx, preds, labels):
-
         softmax_val, loss = paddle._C_ops.cross_entropy_with_softmax(preds, labels, False, True, False, -100, -1)
 
-        # print("softmax val", softmax_val.dtype)
-
         ctx.save_for_backward(labels, softmax_val)
         return loss
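
The FastCrossEntropyFunction change above only removes a stray blank line and a commented-out debug print; the forward still saves (labels, softmax_val) for the backward. For orientation, the matching gradient is the usual softmax-minus-one-hot rule, shown here as a plain-op sketch (not the code in this file, and without the fused kernel or ignore_index handling):

    import paddle
    import paddle.nn.functional as F

    class SketchCrossEntropy(paddle.autograd.PyLayer):
        """Same save/restore contract as FastCrossEntropyFunction, in plain ops."""

        @staticmethod
        def forward(ctx, preds, labels):
            softmax_val = F.softmax(preds, axis=-1)                               # [N, C]
            picked = paddle.take_along_axis(softmax_val, labels.unsqueeze(-1), axis=-1)
            loss = -paddle.log(picked).squeeze(-1)                                # [N]
            ctx.save_for_backward(labels, softmax_val)
            return loss

        @staticmethod
        def backward(ctx, dloss):
            labels, softmax_val = ctx.saved_tensor()
            # d(loss_i)/d(preds_i) = softmax_i - one_hot(label_i); labels get no gradient.
            grad = softmax_val - F.one_hot(labels, softmax_val.shape[-1]).astype(softmax_val.dtype)
            return grad * dloss.unsqueeze(-1)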

paddlenlp/transformers/deepseek_v2/modeling_pp.py

Lines changed: 10 additions & 10 deletions
@@ -67,7 +67,7 @@
 )
 from paddlenlp.transformers.moe_layer import FusionMoeNode
 
-from ..fp8_utils import FP8LinearFunctionBase
+from ..fp8_utils import FP8LinearFunction, FP8LinearFunctionBase
 
 __all__ = [
     "DeepseekV2ForCausalLMPipe",
@@ -1204,11 +1204,12 @@ def forward_backward(self, inputs, output_grad, combine_bw_event_to_wait=None, p
             combine_forward_event.current_stream_wait()
             final_out_event.current_stream_wait()
 
-            if final_out.shape[-1] != combine_fwd_out.shape[-1]:
-                final_out[:, :, : combine_fwd_out.shape[-1]] += combine_fwd_out  # broadcast and add directly
-            else:
-                final_out += combine_fwd_out
-            inputs = final_out
+            # TODO: check correct
+            # if final_out.shape[-1] != combine_fwd_out.shape[-1]:
+            #     final_out[:, :, : combine_fwd_out.shape[-1]] += combine_fwd_out  # broadcast and add directly
+            # else:
+            #     final_out += combine_fwd_out
+            inputs = final_out + combine_fwd_out
 
             final_out._record_stream()
             combine_fwd_out._record_stream()
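
The new path adds combine_fwd_out out of place instead of accumulating into final_out, which implicitly assumes the two tensors now share the same trailing dimension; the old shape-mismatch branch survives only as a commented-out TODO. If that safety net were wanted back, a guard along these lines would do (illustrative only, not part of the commit):

    # Hypothetical check mirroring the removed branch; the commit relies on
    # final_out and combine_fwd_out having the same hidden width.
    assert final_out.shape[-1] == combine_fwd_out.shape[-1], (
        f"hidden width mismatch: {final_out.shape[-1]} vs {combine_fwd_out.shape[-1]}"
    )
    inputs = final_out + combine_fwd_out
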
@@ -1400,9 +1401,7 @@ def build_overlapped_nodes(forward_chunk, backward_chunk):
     backward_pre_node = ScheduleChunk(list(reversed(backward_pre_overlap_layers)))
     backward_post_node = ScheduleChunk(list(reversed(backward_post_overlap_layers)))
 
-    if not forward_chunk.nodes and all(
-        isinstance(n, FusionFp8DecoderLayerNode) for n in backward_chunk.nodes
-    ):
+    if not forward_chunk.nodes and all(isinstance(n, FusionFp8DecoderLayerNode) for n in backward_chunk.nodes):
         backward_post_node = DecoderBackwardScheduleChunk(backward_post_overlap_layers)
 
     overlap_node = OverlapedScheduleChunk(forward_overlap_layers, backward_overlap_layers, use_fuion=DSV3_USE_FP8_GEMM)
@@ -1938,7 +1937,8 @@ def attn_compute_for_fusion(self, args):
         hidden_states = self.hnorm(hidden_states)
         nextn_hidden_state = self.enorm(nextn_hidden_state)
 
-        hidden_states = self.eh_proj(paddle.concat([nextn_hidden_state, hidden_states], axis=-1))
+        concat_h = paddle.concat([nextn_hidden_state, hidden_states], axis=-1)
+        hidden_states = FP8LinearFunction.apply(concat_h, self.eh_proj)
 
         # attention compute
         hidden_states, residual = self.self_attn_compute(hidden_states)
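
Both MTP paths (modeling.py above and this one) now push the concatenated [nextn_hidden_state, hidden_states] through FP8LinearFunction.apply(x, self.eh_proj) instead of calling the layer or LMHeadFunction directly, so eh_proj takes the same blockwise-FP8 deep_gemm route as the other FP8 linears. A bf16 stand-in for what such a PyLayer looks like (the real one lives in fp8_utils.py, takes the Linear layer itself, and quantizes both operands; this is only the autograd skeleton):

    import paddle

    class SketchLinearFunction(paddle.autograd.PyLayer):
        """Autograd skeleton only; FP8LinearFunction additionally quantizes x and w
        to FP8 (1x128 blocks) and dispatches the matmuls to deep_gemm."""

        @staticmethod
        def forward(ctx, x, w):
            ctx.save_for_backward(x, w)
            return paddle.matmul(x, w)                     # [tokens, in] @ [in, out]

        @staticmethod
        def backward(ctx, dout):
            x, w = ctx.saved_tensor()
            dx = paddle.matmul(dout, w, transpose_y=True)  # gradient w.r.t. the input
            dw = paddle.matmul(x, dout, transpose_x=True)  # gradient w.r.t. the weight
            return dx, dw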

paddlenlp/transformers/fp8_utils.py

Lines changed: 15 additions & 9 deletions
@@ -50,6 +50,10 @@ def swiglu(x, y=None):
 ]
 
 
+def get_sm_num():
+    return 112
+
+
 def set_parameter_color(
     parameters, color, group=None, offline_quant_expert_weight=True, clear_origin_weight_when_offline_quant=True
 ):
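
Every DeepGEMM call site below now reads its SM budget from get_sm_num() instead of hard-coding 118, and the helper lowers the budget to 112, presumably to leave headroom for overlapped communication kernels. If one wanted to derive the number from the device instead of hard-coding it (an alternative sketch, not what the commit does), something like this would work:

    import paddle

    def get_sm_num_from_device(reserved_sms: int = 20) -> int:
        """Hypothetical variant: give DeepGEMM everything except a few SMs
        reserved for concurrent communication kernels."""
        props = paddle.device.cuda.get_device_properties()
        return max(props.multi_processor_count - reserved_sms, 1)
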
@@ -159,7 +163,7 @@ def padding_and_quant_input(tensor):
     tensor_t_fp8, tensor_t_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
         tensor,
         output_scale_transpose=True,
-        tquant_method="1x128",
+        quant_method="1x128",
         input_transpose=True,
         return_transpose_only=True,
     )
@@ -178,7 +182,7 @@ def kitchen_gemm(
         if out is None:
             out = paddle.zeros([x_fp8.shape[0], w_fp8.shape[0]], rtn_dtype)
         if numpy.prod(x_fp8.shape) != 0 and numpy.prod(w_fp8.shape) != 0:
-            deep_gemm.wgrad_gemm_fp8_fp8_fp32_nt((x_fp8, x_scale), (w_fp8, w_scale), out, num_sms=118)
+            deep_gemm.wgrad_gemm_fp8_fp8_fp32_nt((x_fp8, x_scale), (w_fp8, w_scale), out, num_sms=get_sm_num())
         return out
 
     if out is not None:
@@ -261,7 +265,9 @@ def compute_fp8_linear(
     if out is None:
         out = paddle.empty([input_fp8.shape[0], weight_fp8.shape[0]], dtype=weight.dtype)
 
-    deep_gemm.gemm_fp8_fp8_bf16_nt((input_fp8, input_scale.T), (weight_fp8, weight_scale), out, num_sms=118)
+    deep_gemm.gemm_fp8_fp8_bf16_nt(
+        (input_fp8, input_scale.T), (weight_fp8, weight_scale), out, num_sms=get_sm_num()
+    )
 
     # Return outputs
     if return_mode == "output_only":
@@ -351,7 +357,7 @@ def common_fp8_mlp_bwd(
     # Recompute o1 using deep_gemm(x_fp8, w1_t_fp8)
     w1_fp8, w1_scale = weight_quant(w1, True)
     o1 = paddle.empty([x_fp8.shape[0], w1_fp8.shape[0]], dtype=do3.dtype)
-    deep_gemm.gemm_fp8_fp8_bf16_nt((x_fp8, x_scale.T), (w1_fp8, w1_scale), o1, num_sms=118)
+    deep_gemm.gemm_fp8_fp8_bf16_nt((x_fp8, x_scale.T), (w1_fp8, w1_scale), o1, num_sms=get_sm_num())
 
     # ===== [recompute] o2 = swiglu(o1) =====
     o2 = swiglu(o1)
@@ -838,7 +844,7 @@ def split_group_gemm(x_fp8, x_scale, w_fp8, w_scale, tokens_per_expert, gemm_out
             (x_fp8[start_idx:end_idx], x_scale_tma_align),
             (w_fp8[i], w_scale[i]),
             gemm_out[start_idx:end_idx],
-            num_sms=118,
+            num_sms=get_sm_num(),
         )
 
         start_idx = end_idx
@@ -927,7 +933,7 @@ def fwd_gate_up(self, x, expert_w1, num_expert, tokens_per_expert, m_indices=Non
            (w1_t_quant, w1_t_scale),
            o1,
            m_indices=self.m_indices if m_indices is None else m_indices,
-            num_sms=118,
+            num_sms=get_sm_num(),
        )
 
        if m_indices is None:
@@ -981,7 +987,7 @@ def fwd_down(
            (w2_quant, w2_scale),
            o3,
            m_indices=m_indices if self.fwd_subbatch else self.m_indices,
-            num_sms=118,
+            num_sms=get_sm_num(),
        )
 
        return o3
@@ -1022,7 +1028,7 @@ def bwd_dowm_input(self, expert_w2, unzipped_grad, o1, tokens_per_expert, m_indi
            (bw_w2_quant, bw_w2_scale),
            do2_s,
            m_indices=m_indices if self.bwd_subbatch else self.m_indices,
-            num_sms=118,
+            num_sms=get_sm_num(),
        )
 
        with paddle.amp.auto_cast(False):
@@ -1068,7 +1074,7 @@ def bwd_gate_up_input(self, do1, expert_w1, tokens_per_expert, m_indices=None, d
            (bw_w1_quant, bw_w1_scale),
            dx,
            m_indices=m_indices if self.bwd_subbatch else self.m_indices,
-            num_sms=118,
+            num_sms=get_sm_num(),
        )
 
        return dx
