
Commit 72dc153

add final config (#10933)
1 parent 38751cb commit 72dc153

4 files changed: +104 -37 lines

llm/config/deepseek-v3/pretrain_argument.json

Lines changed: 11 additions & 6 deletions
@@ -4,20 +4,20 @@
     "input_dir": "./data",
     "output_dir": "./checkpoints/pretrain_ckpts",
     "per_device_train_batch_size": 1,
-    "gradient_accumulation_steps": 24,
+    "gradient_accumulation_steps": 60,
     "per_device_eval_batch_size": 1,
     "tensor_parallel_degree": 1,
-    "pipeline_parallel_degree": 4,
+    "pipeline_parallel_degree": 8,
     "pipeline_parallel_config": "use_dualpipev",
-    "sharding_parallel_degree": 2,
+    "sharding_parallel_degree": 64,
     "sharding_parallel_config": "split_param enable_fuse_optimizer_states",
     "sharding_comm_buffer_size_MB": 4096,
-    "expert_parallel_degree": 2,
+    "expert_parallel_degree": 64,
     "sharding": "stage1",
     "virtual_pp_degree": 1,
     "sequence_parallel": 0,
     "use_flash_attention": true,
-    "max_seq_length": 4096,
+    "max_seq_length": 4097,
     "learning_rate": 3e-05,
     "min_learning_rate": 3e-06,
     "warmup_steps": 30,
@@ -44,5 +44,10 @@
     "skip_profile_timer": false,
     "use_fused_rms_norm": true,
     "fuse_attention_ffn": true,
-    "use_fused_rope": true
+    "use_fused_rope": true,
+    "save_sharded_model": false,
+    "load_sharded_model": false,
+    "unified_checkpoint": true,
+    "use_expert_parallel": true,
+    "unified_checkpoint_config": "skip_save_model_weight"
 }
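A quick sanity check on the degrees above (an illustration, not part of the commit): in PaddleNLP-style hybrid parallelism, and assuming no extra pure data-parallel dimension, the world size is the product of the tensor-, pipeline-, and sharding-parallel degrees, with expert parallelism laid out across those same cards. A minimal Python sketch under that assumption:

import json

# Minimal sketch: infer the GPU count this config expects, assuming that
# world size = tp * pp * sharding and that expert parallelism reuses those cards.
with open("llm/config/deepseek-v3/pretrain_argument.json") as f:
    cfg = json.load(f)

world = (
    cfg["tensor_parallel_degree"]      # 1
    * cfg["pipeline_parallel_degree"]  # 8
    * cfg["sharding_parallel_degree"]  # 64
)
print(world)                                   # 512 GPUs
print(cfg["expert_parallel_degree"] <= world)  # True: the EP degree of 64 fits inside the world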

llm/model_config/DeepSeek-V3/config.json

Lines changed: 9 additions & 6 deletions
@@ -24,14 +24,14 @@
     "moe_intermediate_size": 2048,
     "moe_layer_freq": 1,
     "n_group": 8,
-    "n_routed_experts": 8,
+    "n_routed_experts": 256,
     "n_shared_experts": 1,
     "norm_topk_prob": true,
     "num_attention_heads": 128,
     "num_experts_per_tok": 8,
-    "num_hidden_layers": 13,
+    "num_hidden_layers": 61,
     "num_key_value_heads": 128,
-    "num_nextn_predict_layers": 0,
+    "num_nextn_predict_layers": 1,
     "pretraining_tp": 1,
     "q_lora_rank": 1536,
     "qk_nope_head_dim": 128,
@@ -64,7 +64,10 @@
     "fuse_attention_ffn": true,
     "use_fused_rope": true,
     "token_drop_steps": 0,
-    "recompute_fwd_gate_up": false,
-    "is_split_group_gemm": true,
-    "use_dualpipev": true
+    "recompute_fwd_gate_up": true,
+    "adaptive_remained_O1_recompute_ratio": 2.0,
+    "using_post_norm_recompute": true,
+    "is_split_group_gemm": false,
+    "use_dualpipev": true,
+    "send_mtp_embed": false
 }
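For orientation on the MoE sizing (again just an illustration): with 256 routed experts and the expert_parallel_degree of 64 set in pretrain_argument.json, an even split places 4 routed experts on each expert-parallel rank, while every token is still dispatched to its top 8 of the 256:

# Back-of-the-envelope MoE layout, assuming experts are split evenly
# across the expert_parallel_degree configured in pretrain_argument.json.
n_routed_experts = 256       # from config.json above
expert_parallel_degree = 64  # from pretrain_argument.json above
num_experts_per_tok = 8      # top-k routing fan-out, from config.json above

local_experts = n_routed_experts // expert_parallel_degree
print(local_experts)         # 4 routed experts hosted per EP rank
print(num_experts_per_tok)   # each token is routed to 8 of the 256 experts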

llm/script/selective_launch.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Selective launch script.

Usage: python script/selective_launch.py <port> <ranks> <ranks> <ranks> ...
"""
import os
import sys


def parse_ranks(ranks_strs):
    """
    Parse the command-line rank arguments into a list of node indices,
    or return None to select every node.
    """
    # NOTE: Returning ranks directly here changes which nodes
    # script/train_gpu.sh and script/kill_process.sh act on, keeping the two in sync.

    # Example 1: Use contiguous nodes [8, 16)
    # return range(8, 16)

    # Example 2: Use non-contiguous nodes [4, 8) + {10} + [30, 32), i.e., [4, 5, 6, 7, 10, 30, 31]
    # return list(range(4, 8)) + [10] + list(range(30, 32))

    # Example 3:
    # Just Python code, return any nodes you want!

    if not ranks_strs:
        return None

    ranks = []
    for r in ranks_strs:
        r = eval(r)  # each argument is a Python expression, e.g. "3" or "range(8, 16)"
        if isinstance(r, int):
            ranks.append(r)
        else:
            ranks.extend(r)
    return ranks


def main(port, ranks):
    """
    Print the --master/--rank/--nnodes arguments for paddle.distributed.launch
    when the current node is selected; print nothing otherwise.
    """
    ips = [ip.strip() for ip in os.getenv("TRAINER_INSTANCES").split(",") if ip.strip()]
    if ranks is None:
        ranks = list(range(len(ips)))
    ranks = sorted(list(set(ranks)))
    my_rank = int(os.getenv("POD_INDEX", "0"))
    if my_rank not in ranks:
        return

    rank = ranks.index(my_rank)
    nranks = len(ranks)

    master = ips[ranks[0]]
    print(f"--master {master}:{port} --rank {rank} --nnodes {nranks}")


if __name__ == "__main__":
    main(int(sys.argv[1]), parse_ranks(sys.argv[2:]))
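A minimal sketch of how the script behaves, assuming the launcher environment provides TRAINER_INSTANCES and POD_INDEX as the code expects and that the script directory is importable; the IP addresses and index are made up for illustration:

import os

# Hypothetical environment, as the cluster launcher would set it.
os.environ["TRAINER_INSTANCES"] = "10.0.0.1,10.0.0.2,10.0.0.3,10.0.0.4"
os.environ["POD_INDEX"] = "2"

from selective_launch import main, parse_ranks

ranks = parse_ranks(["range(1, 4)"])  # select nodes 1, 2 and 3
main(36677, ranks)
# On this node (POD_INDEX=2, second of the three selected nodes) this prints:
#   --master 10.0.0.2:36677 --rank 1 --nnodes 3
# On node 0 it prints nothing, so train_gpu.sh exits early there.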

llm/script/train_gpu.sh

Lines changed: 11 additions & 25 deletions
@@ -35,46 +35,32 @@ export NVSHMEM_IB_TRAFFIC_CLASS=162
 #export NVSHMEM_IB_ENABLE_IBGDA=true
 ##export NVSHMEM_DISABLE_P2P=1
 export NVSHMEM_BOOTSTRAP=UID
-export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME==eth0

-export FLAGS_cudnn_deterministic=1
-export FLAGS_embedding_deterministic=1
+unset NVSHMEM_HCA_LIST
+unset NVSHMEM_ENABLE_NIC_PE_MAPPING

-# Use nodes in the range [START_RANK, END_RANK)
-START_RANK=0
-END_RANK=1
-
-if [[ $rank -lt $START_RANK ]]; then
-    exit 0
-fi
-
-if [[ $rank -ge $END_RANK ]]; then
+LAUNCH_CMD=`python script/selective_launch.py 36677`
+if [[ -z "$LAUNCH_CMD" ]]; then
     exit 0
 fi

-rank=$(($rank-$START_RANK))
-nnodes=$(($END_RANK-$START_RANK))
-
-master=`hostname -i`
-port=36679
 export PYTHONPATH=../:$PYTHONPATH
-export PATH=/opt/nvidia/nsight-systems/2025.1.1/bin/:$PATH
+export CUDA_PATH=/usr/local/cuda-12.9

 export DSV3_USE_FP8_GEMM=true
 export DSV3_USE_ATTEN_RECOMPUTE=true
-# export FA_VERSION=3
-export CUDA_PATH=/usr/local/cuda-12.9
+export FA_VERSION=3
 export FLAGS_share_tensor_for_grad_tensor_holder=1
-export DSV3_USE_FP8_DISPATCH=False
+export FLAGS_use_default_stream=false
+export DSV3_USE_FP8_DISPATCH=true
+export USE_DS_GEMM=false
+

 bash script/kill_process.sh

-# /opt/nvidia/nsight-compute/2025.2.0/host/target-linux-x64/nsys profile --stats=true -t cuda,nvtx -o fp8_overlap_quant --force-overwrite true \
 python3.10 -m paddle.distributed.launch \
     --log_dir output/paddle_distributed_logs \
-    --master $master:$port \
-    --nnodes $nnodes \
-    --rank $rank \
+    $LAUNCH_CMD \
     --run_mode=collective \
     ${script:-run_pretrain.py} \
     $@
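As a rough illustration of the change (not part of the commit): the hard-coded [START_RANK, END_RANK) window removed above can be reproduced by passing a range expression to selective_launch.py, since train_gpu.sh simply exits when the script prints nothing. This assumes TRAINER_INSTANCES and POD_INDEX are set as in the previous sketch, and the window values below are the old defaults rather than anything this commit configures:

import subprocess

# Illustration only: reproduce the removed [START_RANK, END_RANK) shell logic.
START_RANK, END_RANK = 0, 1  # the defaults that used to live in train_gpu.sh
cmd = ["python", "script/selective_launch.py", "36677", f"range({START_RANK}, {END_RANK})"]
out = subprocess.run(cmd, capture_output=True, text=True).stdout.strip()
# Inside the window this yields something like
#   --master <first-selected-ip>:36677 --rank 0 --nnodes 1
# and an empty string elsewhere, which makes train_gpu.sh exit 0 on unselected nodes.
print(out or "(not selected)")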
