modelscope · addsubmuldiv · Jan 8, 2026 · Jan 8, 2026 · gemini-code-assist · Jan 8, 2026
diff --git a/examples/ascend/multi-node/megatron/node1.sh b/examples/ascend/multi-node/megatron/node1.sh
@@ -0,0 +1,33 @@
+# Atlas A2 * 2 nodes * 8 cards per node (this script launches NODE_RANK=0)
+# Replace 'xxx' first: MASTER_ADDR as in node2.sh; HCCL_SOCKET_IFNAME = your network interface name (e.g., eth0, ens33). Keep '#' lines out of the '\' chain below: a comment there ends the command early.
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+NNODES=2 \
+NODE_RANK=0 \
+MASTER_ADDR=xxx.xxx.xxx.xxx \
+MASTER_PORT=29500 \
+NPROC_PER_NODE=8 \
+HCCL_SOCKET_IFNAME=xxx \
+megatron sft \
+    --model 'Qwen/Qwen3-8B' \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' \
+    --save './SAVE' \
+    --train_type 'lora' \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules 'all-linear' \
+    --tensor_model_parallel_size 2 \
+    --pipeline_model_parallel_size 1 \
+    --context_parallel_size 1 \
+    --sequence_parallel true \
+    --micro_batch_size 1 \
+    --global_batch_size 64 \
+    --recompute_granularity selective \
+    --recompute_modules core_attn \
+    --cross_entropy_loss_fusion true \
+    --no_gradient_accumulation_fusion true \
+    --lr 1e-4 \
+    --lr_warmup_fraction 0.05 \
+    --min_lr 1e-5 \
+    --max_epochs 1 \
+    --log_interval 5 \
+    --num_workers 4
diff --git a/examples/ascend/multi-node/megatron/node2.sh b/examples/ascend/multi-node/megatron/node2.sh
@@ -0,0 +1,33 @@
+# Atlas A2 * 2 nodes * 8 cards per node (this script launches NODE_RANK=1)
+# Replace the 'xxx' placeholders before running: HCCL_SOCKET_IFNAME takes your network interface name (e.g., eth0, ens33); MASTER_ADDR presumably takes the rank-0 node's address -- TODO confirm.
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+NNODES=2 \
+NODE_RANK=1 \
+MASTER_ADDR=xxx.xxx.xxx.xxx \
+MASTER_PORT=29500 \
+NPROC_PER_NODE=8 \
+HCCL_SOCKET_IFNAME=xxx \
+megatron sft \
+    --model 'Qwen/Qwen3-8B' \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' \
+    --save './SAVE' \
+    --train_type 'lora' \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules 'all-linear' \
+    --tensor_model_parallel_size 2 \
+    --pipeline_model_parallel_size 1 \
+    --context_parallel_size 1 \
+    --sequence_parallel true \
+    --micro_batch_size 1 \
+    --global_batch_size 64 \
+    --recompute_granularity selective \
+    --recompute_modules core_attn \
+    --cross_entropy_loss_fusion true \
+    --no_gradient_accumulation_fusion true \
+    --lr 1e-4 \
+    --lr_warmup_fraction 0.05 \
+    --min_lr 1e-5 \
+    --max_epochs 1 \
+    --log_interval 5 \
+    --num_workers 4
diff --git a/...scend/train/qwen3_lora_deepspeed/train.sh → ...scend/train/qwen3/qwen3_lora_deepspeed.sh b/...scend/train/qwen3_lora_deepspeed/train.sh → ...scend/train/qwen3/qwen3_lora_deepspeed.sh
diff --git a/...es/ascend/train/qwen3_lora_fsdp/fsdp.json → ...end/train/qwen3/qwen3_lora_fsdp/fsdp.json b/...es/ascend/train/qwen3_lora_fsdp/fsdp.json → ...end/train/qwen3/qwen3_lora_fsdp/fsdp.json
diff --git a/...les/ascend/train/qwen3_lora_fsdp/train.sh → ...cend/train/qwen3/qwen3_lora_fsdp/train.sh b/...les/ascend/train/qwen3_lora_fsdp/train.sh → ...cend/train/qwen3/qwen3_lora_fsdp/train.sh
diff --git a/...nd/train/qwen3_lora_megatron/dense_npu.sh → ...ascend/train/qwen3/qwen3_lora_megatron.sh b/...nd/train/qwen3_lora_megatron/dense_npu.sh → ...ascend/train/qwen3/qwen3_lora_megatron.sh
@@ -1,7 +1,7 @@
 NPROC_PER_NODE=2 \
 ASCEND_RT_VISIBLE_DEVICES=0,1 \
 megatron sft \
-    --model Qwen/Qwen2.5-7B-Instruct \
+    --model Qwen/Qwen3-8B \
     --load_safetensors true \
     --save_safetensors true \
     --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
@@ -24,7 +24,7 @@ megatron sft \
     --lr_warmup_fraction 0.05 \
     --min_lr 1e-5 \
     --max_epochs 1 \
-    --save megatron_output/Qwen2.5-7B-Instruct \
+    --save megatron_output/Qwen3-8B \
     --save_interval 100 \
     --max_length 2048 \
     --system 'You are a helpful assistant.' \