diff --git a/examples/ascend/multi-node/megatron/node1.sh b/examples/ascend/multi-node/megatron/node1.sh new file mode 100644 index 0000000000..6a027e69a7 --- /dev/null +++ b/examples/ascend/multi-node/megatron/node1.sh @@ -0,0 +1,33 @@ +# Atlas A2 * 2 nodes * 8 cards per node + +ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +NNODES=2 \ +NODE_RANK=0 \ +MASTER_ADDR=127.0.0.1 \ +MASTER_PORT=29500 \ +NPROC_PER_NODE=8 \ +HCCL_SOCKET_IFNAME=xxx \ +megatron sft \ + --model 'Qwen/Qwen3-8B' \ + --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' \ + --save './SAVE' \ + --train_type 'lora' \ + --lora_rank 8 \ + --lora_alpha 32 \ + --target_modules 'all-linear' \ + --tensor_model_parallel_size 2 \ + --pipeline_model_parallel_size 1 \ + --context_parallel_size 1 \ + --sequence_parallel true \ + --micro_batch_size 1 \ + --global_batch_size 64 \ + --recompute_granularity selective \ + --recompute_modules core_attn \ + --cross_entropy_loss_fusion true \ + --no_gradient_accumulation_fusion true \ + --lr 1e-4 \ + --lr_warmup_fraction 0.05 \ + --min_lr 1e-5 \ + --max_epochs 1 \ + --log_interval 5 \ + --num_workers 4 diff --git a/examples/ascend/multi-node/megatron/node2.sh b/examples/ascend/multi-node/megatron/node2.sh new file mode 100644 index 0000000000..85bdf76326 --- /dev/null +++ b/examples/ascend/multi-node/megatron/node2.sh @@ -0,0 +1,33 @@ +# Atlas A2 * 2 nodes * 8 cards per node + +ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +NNODES=2 \ +NODE_RANK=1 \ +MASTER_ADDR=xxx.xxx.xxx.xxx \ +MASTER_PORT=29500 \ +NPROC_PER_NODE=8 \ +HCCL_SOCKET_IFNAME=xxx \ +megatron sft \ + --model 'Qwen/Qwen3-8B' \ + --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' \ + --save './SAVE' \ + --train_type 'lora' \ + --lora_rank 8 \ + --lora_alpha 32 \ + --target_modules 'all-linear' \ + --tensor_model_parallel_size 2 \ + --pipeline_model_parallel_size 1 \ + --context_parallel_size 1 \ + --sequence_parallel true \ + --micro_batch_size 1 \ + --global_batch_size 64 \ + --recompute_granularity selective \ + --recompute_modules core_attn \ + --cross_entropy_loss_fusion true \ + --no_gradient_accumulation_fusion true \ + --lr 1e-4 \ + --lr_warmup_fraction 0.05 \ + --min_lr 1e-5 \ + --max_epochs 1 \ + --log_interval 5 \ + --num_workers 4 diff --git a/examples/ascend/train/qwen3_lora_deepspeed/train.sh b/examples/ascend/train/qwen3/qwen3_lora_deepspeed.sh old mode 100644 new mode 100755 similarity index 100% rename from examples/ascend/train/qwen3_lora_deepspeed/train.sh rename to examples/ascend/train/qwen3/qwen3_lora_deepspeed.sh diff --git a/examples/ascend/train/qwen3_lora_fsdp/fsdp.json b/examples/ascend/train/qwen3/qwen3_lora_fsdp/fsdp.json old mode 100644 new mode 100755 similarity index 100% rename from examples/ascend/train/qwen3_lora_fsdp/fsdp.json rename to examples/ascend/train/qwen3/qwen3_lora_fsdp/fsdp.json diff --git a/examples/ascend/train/qwen3_lora_fsdp/train.sh b/examples/ascend/train/qwen3/qwen3_lora_fsdp/train.sh old mode 100644 new mode 100755 similarity index 100% rename from examples/ascend/train/qwen3_lora_fsdp/train.sh rename to examples/ascend/train/qwen3/qwen3_lora_fsdp/train.sh diff --git a/examples/ascend/train/qwen3_lora_megatron/dense_npu.sh b/examples/ascend/train/qwen3/qwen3_lora_megatron.sh old mode 100644 new mode 100755 similarity index 92% rename from examples/ascend/train/qwen3_lora_megatron/dense_npu.sh rename to examples/ascend/train/qwen3/qwen3_lora_megatron.sh index 897495c15e..74747024ec --- a/examples/ascend/train/qwen3_lora_megatron/dense_npu.sh +++ b/examples/ascend/train/qwen3/qwen3_lora_megatron.sh @@ -1,7 +1,7 @@ NPROC_PER_NODE=2 \ ASCEND_RT_VISIBLE_DEVICES=0,1 \ megatron sft \ - --model Qwen/Qwen2.5-7B-Instruct \ + --model Qwen/Qwen3-8B \ --load_safetensors true \ --save_safetensors true \ --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ @@ -24,7 +24,7 @@ megatron sft \ --lr_warmup_fraction 0.05 \ --min_lr 1e-5 \ --max_epochs 1 \ - --save megatron_output/Qwen2.5-7B-Instruct \ + --save megatron_output/Qwen3-8B \ --save_interval 100 \ --max_length 2048 \ --system 'You are a helpful assistant.' \