diff --git a/llm/auto_parallel/galvatron/scripts/profile_all2all.sh b/llm/auto_parallel/galvatron/scripts/profile_all2all.sh
new file mode 100644
index 000000000000..bb18754ef915
--- /dev/null
+++ b/llm/auto_parallel/galvatron/scripts/profile_all2all.sh
@@ -0,0 +1,26 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5
+export NCCL_IB_DISABLE=0
+
+# Profile all2all time for every tensor-parallel degree and local batch size (replace `xxx` with the node IPs).
+for tp_deg in 8 4 2; do
+    for local_batch_size in 1024 512 256 128 64 32 16 8 4 2 1; do
+        cmd="python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_all2all ./paddlenlp/experimental/galvatron/profiler/profile_all2all.py --output_dir ./output --tp_deg ${tp_deg} --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/sp_time_1nodes_8gpus_per_node.json --local_batch_size ${local_batch_size}"
+        echo "Running: ${cmd}"
+        ${cmd}
+        sleep 1
+    done
+done
diff --git a/llm/auto_parallel/galvatron/scripts/profile_allreduce.sh b/llm/auto_parallel/galvatron/scripts/profile_allreduce.sh
new file mode 100644
index 000000000000..d3774efd018e
--- /dev/null
+++ b/llm/auto_parallel/galvatron/scripts/profile_allreduce.sh
@@ -0,0 +1,26 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5
+export NCCL_IB_DISABLE=0
+echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir ./output --profile_time 0 --tp_deg 8 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json"
+python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 8 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
+sleep 1
+echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir ./output --profile_time 0 --tp_deg 4 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json"
+python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 4 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
+sleep 1
+echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir ./output --profile_time 0 --tp_deg 2 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json"
+python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 2 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
+sleep 1
+rm -r ./profiler_log
\ No newline at end of file
diff --git a/llm/auto_parallel/galvatron/scripts/profile_allreduce_sp.sh b/llm/auto_parallel/galvatron/scripts/profile_allreduce_sp.sh
new file mode 100644
index 000000000000..d5120c461f8a
--- /dev/null
+++ b/llm/auto_parallel/galvatron/scripts/profile_allreduce_sp.sh
@@ -0,0 +1,27 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5
+export NCCL_IB_DISABLE=0
+
+# Profile allreduce time with sequence parallelism for every tensor-parallel degree and local batch size.
+for tp_deg in 8 4 2; do
+    for local_batch_size in 1024 512 256 128 64 32 16 8 4 2 1; do
+        cmd="python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_allreduce_sp ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir ./output --profile_time 1 --tp_deg ${tp_deg} --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/sp_time_1nodes_8gpus_per_node.json --local_batch_size ${local_batch_size}"
+        echo "Running: ${cmd}"
+        ${cmd}
+        sleep 1
+    done
+done
+rm -r ./profiler_log
\ No newline at end of file
diff --git a/llm/auto_parallel/galvatron/scripts/profile_computation.sh b/llm/auto_parallel/galvatron/scripts/profile_computation.sh
new file mode 100644
index 000000000000..a0e291ab9ee8
--- /dev/null
+++ b/llm/auto_parallel/galvatron/scripts/profile_computation.sh
@@ -0,0 +1,160 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+unset CUDA_VISIBLE_DEVICES
+
+unset PADDLE_ELASTIC_JOB_ID
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+unset FLAGS_START_PORT
+unset PADDLE_ELASTIC_TIMEOUT
+unset PADDLE_TRAINERS_NUM
+unset PADDLE_TRAINER_ID
+unset PADDLE_WORKERS_IP_PORT_LIST
+unset PADDLE_TRAINERS
+unset PADDLE_NUM_GRADIENT_SERVERS
+
+source
+
+task_name="qwen"
+dir_name="profile_computation"
+rm -rf output/$dir_name/$task_name/
+rm -rf "output/$dir_name/$task_name""_log"
+
+export SOT_LOG_LEVEL=4
+export PYTHONPATH=../../../:$PYTHONPATH
+
+TRAINER="./train_qwen.py"
+LAUNCHER="python -u -m paddle.distributed.launch"
+LAUNCHER="${LAUNCHER} --gpus 7" # Set which GPUs to use
+LAUNCHER="${LAUNCHER} --log_dir output/$dir_name/$task_name""_log ${TRAINER} --output_dir "./output""
+
+export LAUNCHER=$LAUNCHER
+
+# [max_steps] [logging_steps] [enable_auto_parallel]
+TRAIN_ARGS="
+    --weight_decay 0.01 \
+    --warmup_ratio 0.01 \
+    --max_grad_norm 1.0 \
+    --learning_rate 3e-05 \
+    --min_learning_rate 3e-06 \
+    --max_steps 25 \
+    --logging_steps 1 \
+    --continue_training 0 \
+    --do_train true \
+    --do_eval false \
+    --do_predict false \
+    --disable_tqdm true \
+    --skip_profile_timer false \
+    --skip_memory_metrics 0 \
+    --save_total_limit 2 \
+    --device gpu \
+    --dataloader_num_workers 1 \
+    --distributed_dataloader 0 \
+    --enable_auto_parallel 1 \
+"
+
+# [seq_length] [num_hidden_layers]
+MODEL_ARGS="
+    --model_name_or_path "llama" \
+    --tokenizer_name_or_path "llama" \
+    --num_hidden_layers 2 \
+    --intermediate_size 25600 \
+    --vocab_size 32000 \
+    --hidden_size 5120 \
+    --seq_length 1024 \
+    --num_attention_heads 64 \
+    --num_key_value_heads 8 \
+"
+
+# [mbsz, accumulation_steps] [recompute] [amp]
+CONFIG_ARGS="
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 4 \
+    --recompute true \
+    --recompute_use_reentrant true \
+    --recompute_granularity full \
+    --pp_recompute_interval 0 \
+    --bf16 true \
+    --fp16_opt_level "O2" \
+    --amp_master_grad true \
+    --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+    --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+"
+
+# [dp_deg, dp_type] [tp_deg, megatron-sp] [pp_deg, 1F1B] [parallel_configs]
+PARALLEL_ARGS=(
+    --to_static 0
+    --sharding_parallel_degree 1
+    --sharding "stage2"
+    --tensor_parallel_degree 2
+    --sequence_parallel true
+    --pipeline_parallel_degree 2
+    --virtual_pp_degree 1
+    --pipeline_schedule_mode "1F1B"
+    --sep_parallel_degree 1
+    --pipeline_parallel_config "enable_send_recv_overlap"
+    --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate"
+    --sharding_parallel_config "enable_overlap enable_release_grads"
+    --tensor_parallel_config "enable_mp_async_allreduce replace_with_parallel_cross_entropy"
+)
+
+# [fused] [flash_attention]
+DEFAULT_OPTIMIZER="
+    --fuse_attention_ffn true \
+    --fuse_attention_qkv true \
+    --fused_linear_param_grad_add 1 \
+    --fuse_sequence_parallel_allreduce true \
+    --use_flash_attention true \
+    --use_fused_rope true \
+    --use_fused_rms_norm false \
+    --enable_linear_fused_grad_add true \
+"
+
+# [data]
+DATA_ARGS="
+    --input_dir ./data \
+    --split 949,50,1 \
+    --max_seq_length 16384"
+
+# [runtime profiler]
+RUNTIME_PROFILE_ARGS="
+    --profile_time_flag 1 \
+    --profile_forward_only 1 \
+    --save_time_flag 1 \
+"
+
+# [model profiler] [sequence type]
+MODEL_PROFILER_ARGS="
+    --profile_type computation \
+    --profile_mode sequence \
+    --profile_fixed_batch_size 1 \
+    --layernum_min 1 \
+    --layernum_max 2 \
+    --profile_min_seq_length 4096 \
+    --profile_max_seq_length 16384 \
+    --profile_seq_length_step 4096 \
+    --num_layertype 1 \
+"
+
+python ./profile.py \
+    $MODEL_ARGS \
+    $TRAIN_ARGS \
+    $CONFIG_ARGS \
+    "${PARALLEL_ARGS[@]}" \
+    $DEFAULT_OPTIMIZER \
+    $DATA_ARGS \
+    $RUNTIME_PROFILE_ARGS \
+    $MODEL_PROFILER_ARGS
\ No newline at end of file
diff --git a/llm/auto_parallel/galvatron/scripts/profile_hardware.sh b/llm/auto_parallel/galvatron/scripts/profile_hardware.sh
new file mode 100644
index 000000000000..865f7c812adb
--- /dev/null
+++ b/llm/auto_parallel/galvatron/scripts/profile_hardware.sh
@@ -0,0 +1,32 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+launch="${interpreter} -u -m paddle.distributed.launch"
+launch="${launch} --master $master:$port --nnodes $nnodes --rank $rank --gpus 0,1,2,3,4,5,6,7"
+
+export INTERPRETER=${interpreter}
+export LAUNCHER=${launch}
+export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5
+export NCCL_IB_DISABLE=0
+
+PROFILE_HARDWARE_ARGS=(
+    --num_nodes $nnodes
+    --num_gpus_per_node 8
+    --backend 'paddle'
+    --max_pp_deg 8
+    --max_tp_deg 8
+)
+
+${interpreter} profile_hardware.py \
+    "${PROFILE_HARDWARE_ARGS[@]}"
\ No newline at end of file
diff --git a/llm/auto_parallel/galvatron/scripts/profile_memory.sh b/llm/auto_parallel/galvatron/scripts/profile_memory.sh
new file mode 100644
index 000000000000..2aa615f7eb9c
--- /dev/null
+++ b/llm/auto_parallel/galvatron/scripts/profile_memory.sh
@@ -0,0 +1,161 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +set -x +unset CUDA_VISIBLE_DEVICES + +unset PADDLE_ELASTIC_JOB_ID +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +unset FLAGS_START_PORT +unset PADDLE_ELASTIC_TIMEOUT +unset PADDLE_TRAINERS_NUM +unset PADDLE_TRAINER_ID +unset PADDLE_WORKERS_IP_PORT_LIST +unset PADDLE_TRAINERS +unset PADDLE_NUM_GRADIENT_SERVERS + +source + +task_name="qwen_profile_memory" +dir_name="profile_memory" +rm -rf output/$dir_name/$task_name/ +rm -rf "output/$dir_name/$task_name""_log" + +export SOT_LOG_LEVEL=4 +export PYTHONPATH=../../../:$PYTHONPATH + +TRAINER="./train_qwen.py" +LAUNCHER="python -u -m paddle.distributed.launch" +LAUNCHER="${LAUNCHER} --gpus 0,1,2,3,4,5,6,7" # set the GPUs to use +LAUNCHER="${LAUNCHER} --log_dir output/$dir_name/$task_name""_log ${TRAINER} --output_dir "./output"" + +export LAUNCHER=$LAUNCHER +export PROFILE_WORLD_SIZE=8 + +# [max_steps] [logging_steps] [enable_auto_parallel] +TRAIN_ARGS=" + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --max_grad_norm 1.0 \ + --learning_rate 3e-05 \ + --min_learning_rate 3e-06 \ + --max_steps 10 \ + --logging_steps 1 \ + --continue_training 0 \ + --do_train true \ + --do_eval false \ + --do_predict false \ + --disable_tqdm true \ + --skip_profile_timer false \ + --skip_memory_metrics 0 \ + --save_total_limit 2 \ + --device gpu \ + --dataloader_num_workers 1 \ + --distributed_dataloader 0 \ + --enable_auto_parallel 1 \ +" + +# [seq_length] [num_hidden_layers] +MODEL_ARGS=" + --model_name_or_path "llama" \ + --tokenizer_name_or_path "llama" \ + --num_hidden_layers 2 \ + --intermediate_size 25600 \ + --vocab_size 32000 \ + --hidden_size 5120 \ + --seq_length 1024 \ + --num_attention_heads 64 \ + --num_key_value_heads 8 \ +" + +# [mbsz, accumulation_steps] [recompute] [amp] +CONFIG_ARGS=" + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 1 \ + --recompute false \ + --recompute_use_reentrant true \ + --recompute_granularity full \ + --pp_recompute_interval 0 \ + --bf16 true \ + --fp16_opt_level "O2" \ + --amp_master_grad true \ + --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \ + --amp_custom_white_list "lookup_table" "lookup_table_v2" \ +" + +# [dp_deg, dp_type] [tp_deg, megatron-sp] [pp_deg, 1F1B] [parallel_configs] +PARALLEL_ARGS=( + --to_static 0 + --sharding_parallel_degree 1 + --sharding "stage2" + --tensor_parallel_degree 2 + --sequence_parallel true + --pipeline_parallel_degree 2 + --virtual_pp_degree 1 + --pipeline_schedule_mode "1F1B" + --sep_parallel_degree 1 + --pipeline_parallel_config "enable_send_recv_overlap" + --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" + --sharding_parallel_config "enable_overlap" + --tensor_parallel_config "enable_mp_async_allreduce" +) + +# [fused] [flash_attention] +DEFAULT_OPTIMIZER_ARGS=" + --fuse_attention_ffn true \ + --fuse_attention_qkv true \ + --fused_linear_param_grad_add 1 \ + --fuse_sequence_parallel_allreduce true \ + --use_flash_attention true \ + --use_fused_rope true \ + --use_fused_rms_norm true \ + --enable_linear_fused_grad_add true \ +" + +# [data] +DATA_ARGS=" + --input_dir ./data \ + --split 949,50,1 \ + --max_seq_length 16384" + +# [runtime profiler] +RUNTIME_PROFILE_ARGS=" + --profile_memory_flag 1 \ + --save_memory_flag 1 \ +" + +# [model profiler] [static type] +MODEL_PROFILER_ARGS=" + --profile_type memory \ + --profile_mode static \ + --profile_fixed_batch_size 8 \ + --layernum_min 1 \ + --layernum_max 2 \ + --profile_fixed_seq_length_list 16384 \ + --num_layertype 1 \ + 
--max_tp_deg 8 \ + --max_per_device_train_batch_size 4 \ +" + +python ./profile.py \ + $MODEL_ARGS \ + $TRAIN_ARGS \ + $CONFIG_ARGS \ + "${PARALLEL_ARGS[@]}" \ + $DEFAULT_OPTIMIZER_ARGS \ + $DATA_ARGS \ + $RUNTIME_PROFILE_ARGS \ + $MODEL_PROFILER_ARGS + \ No newline at end of file diff --git a/llm/auto_parallel/galvatron/scripts/profile_overlap.sh b/llm/auto_parallel/galvatron/scripts/profile_overlap.sh new file mode 100644 index 000000000000..4acc2133bece --- /dev/null +++ b/llm/auto_parallel/galvatron/scripts/profile_overlap.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5 +export NCCL_IB_DISABLE=0 +echo "Running python3 -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 --log_dir output/profile_overlap ./paddlenlp/experimental/galvatron/profiler/profile_overlap.py --output_dir "./output"" +python3 -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 --log_dir output/profile_overlap ./paddlenlp/experimental/galvatron/profiler/profile_overlap.py --output_dir "./output" +sleep 1 +rm -r ./profiler_log \ No newline at end of file diff --git a/llm/auto_parallel/galvatron/scripts/profile_p2p.sh b/llm/auto_parallel/galvatron/scripts/profile_p2p.sh new file mode 100644 index 000000000000..b8fa0c79f062 --- /dev/null +++ b/llm/auto_parallel/galvatron/scripts/profile_p2p.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
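+
+# profile_p2p.sh: measures point-to-point (pipeline send/recv) bandwidth for
+# pp_deg 2, 4 and 8 and writes the results to a p2p_bandwidth_*nodes_*gpus_per_node.json
+# config. Replace the "--ips xxx" placeholder below with your actual node IP list
+# before running; a file of this form is presumably what search_dist.sh later
+# consumes via --p2p_coe_path.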
+ +export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5 +export NCCL_IB_DISABLE=0 +echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_p2p ./paddlenlp/experimental/galvatron/profiler/profile_p2p.py --output_dir "./output" --pp_deg 2 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/p2p_bandwidth_1nodes_8gpus_per_node.json " +python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_p2p ./paddlenlp/experimental/galvatron/profiler/profile_p2p.py --output_dir "./output" --pp_deg 2 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/p2p_bandwidth_1nodes_8gpus_per_node.json +sleep 1 +echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_p2p ./paddlenlp/experimental/galvatron/profiler/profile_p2p.py --output_dir "./output" --pp_deg 4 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/p2p_bandwidth_1nodes_8gpus_per_node.json " +python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_p2p ./paddlenlp/experimental/galvatron/profiler/profile_p2p.py --output_dir "./output" --pp_deg 4 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/p2p_bandwidth_1nodes_8gpus_per_node.json +sleep 1 +echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_p2p ./paddlenlp/experimental/galvatron/profiler/profile_p2p.py --output_dir "./output" --pp_deg 8 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/p2p_bandwidth_1nodes_8gpus_per_node.json " +python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_p2p ./paddlenlp/experimental/galvatron/profiler/profile_p2p.py --output_dir "./output" --pp_deg 8 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/p2p_bandwidth_1nodes_8gpus_per_node.json +sleep 1 +rm -r ./profiler_log \ No newline at end of file diff --git a/llm/auto_parallel/galvatron/scripts/search_dist.sh b/llm/auto_parallel/galvatron/scripts/search_dist.sh new file mode 100644 index 000000000000..80d75b2aa228 --- /dev/null +++ b/llm/auto_parallel/galvatron/scripts/search_dist.sh @@ -0,0 +1,66 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
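+
+# search_dist.sh: runs the Galvatron strategy search on top of the profiling
+# results. ProfileDataParserArgs points at the JSON files produced by the
+# profiling scripts (computation time, memory, overlap coefficient, allreduce/p2p
+# bandwidth, sequence-parallel time); SearchEngineArgs bounds the search space
+# (world size, batch size range, max TP/PP degree, memory limit). The paths below
+# assume an 8-node x 8-GPUs-per-node cluster and must match your own profiling output.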
+ +set -x + +unset PADDLE_ELASTIC_JOB_ID +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +unset FLAGS_START_PORT +unset PADDLE_ELASTIC_TIMEOUT +unset PADDLE_TRAINERS_NUM +unset PADDLE_TRAINER_ID +unset PADDLE_WORKERS_IP_PORT_LIST +unset PADDLE_TRAINERS +unset PADDLE_NUM_GRADIENT_SERVERS + +source + +export PYTHONPATH=../../..:$PYTHONPATH + +ProfileDataParserArgs=" + --time_profile_mode sequence \ + --memory_profile_mode static \ + --num_layertype 1 \ + --hidden_size_list 5120 \ + --layernum_list 72 \ + --seqlen_list 32768 \ + --profile_gpu_num 64 \ + --time_profile_data_path ./configs/computation_profiling_bf16_llama_rank[0].json \ + --memory_profile_data_path ./configs/memory_profiling_bf16_llama.json \ + --overlap_coe_path ./configs/overlap_coefficient.json \ + --allreduce_coe_path ./configs/allreduce_bandwidth_8nodes_8gpus_per_node.json \ + --p2p_coe_path ./configs/p2p_bandwidth_8nodes_8gpus_per_node.json \ + --sp_time_path ./configs/sp_time_8nodes_8gpus_per_node.json \ +" + +SearchEngineArgs=" + --search_granularity fine-grained \ + --world_size 64 \ + --min_bsz 64 \ + --max_bsz 64 \ + --bsz_step 1 \ + --max_tp_size 8 \ + --max_pp_size 8 \ + --mixed_precision_type bf16 \ + --memory_upper_limit 95 \ + --sp_space tp \ + --layernum 72 \ + --disable_sdp 0 \ + --disable_vtp 0 \ + --parallel_search 0 \ + --log_dir ./search-engine-logs \ +" + +python ./search_dist.py ${ProfileDataParserArgs} ${SearchEngineArgs} \ No newline at end of file diff --git a/llm/auto_parallel/galvatron/scripts/train_qwen.sh b/llm/auto_parallel/galvatron/scripts/train_qwen.sh new file mode 100644 index 000000000000..de2e20522a67 --- /dev/null +++ b/llm/auto_parallel/galvatron/scripts/train_qwen.sh @@ -0,0 +1,143 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
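+
+# train_qwen.sh: coarse-grained baseline run that applies one hand-written parallel
+# strategy to the whole model (sharding stage3 degree 2, TP 4, PP 1, sequence
+# parallel enabled). Compare with train_qwen_fine_graine.sh, which trains with a
+# per-layer fine-grained configuration read from ./configs/fine_grained_config.json.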
+ +set -x +unset CUDA_VISIBLE_DEVICES + +task_name="corase_grained" +dir_name="fine-vs-corase" + +export HF_ENDPOINT=https://hf-mirror.com +rm -rf output/$dir_name/$task_name/ +rm -rf "output/$dir_name/$task_name""_log" + +export SOT_LOG_LEVEL=4 +export PYTHONPATH=../../../:$PYTHONPATH + +TRAINER="./train_qwen.py" +LAUNCHER="python -u -m paddle.distributed.launch --log_level DEBUG" +LAUNCHER="${LAUNCHER} --gpus 0,1,2,3,4,5,6,7" +LAUNCHER="${LAUNCHER} --log_dir output/$dir_name/$task_name""_log ${TRAINER} --output_dir "./output"" + +# [max_steps] [logging_steps] [enable_auto_parallel] +TRAIN_ARGS=" + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --max_grad_norm 1.0 \ + --learning_rate 3e-05 \ + --min_learning_rate 3e-06 \ + --max_steps 25 \ + --logging_steps 1 \ + --continue_training 0 \ + --do_train true \ + --disable_tqdm true \ + --skip_profile_timer false \ + --skip_memory_metrics 0 \ + --save_total_limit 2 \ + --device gpu \ + --dataloader_num_workers 1 \ + --distributed_dataloader 0 \ + --enable_auto_parallel 1 \ +" + +# [seq_length] [num_hidden_layers] +# still need to use llama as model_type +MODEL_ARGS=( + --model_type "llama" + --num_hidden_layers 16 + --intermediate_size 11008 + --vocab_size 32000 + --hidden_size 4096 + --seq_length 1024 + --num_attention_heads 32 + --num_key_value_heads 32 +) + +# "max_position_embeddings": 32768, +# [mbsz, accumulation_steps] [recompute] [amp] +CONFIG_ARGS=" + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 32 \ + --recompute false \ + --recompute_use_reentrant true \ + --recompute_granularity full \ + --pp_recompute_interval 0 \ + --bf16 true \ + --fp16_opt_level "O2" \ + --amp_master_grad true \ + --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \ + --amp_custom_white_list "lookup_table" "lookup_table_v2" \ +" + +# [dp_deg, dp_type] [tp_deg, megatron-sp] [pp_deg, 1F1B] [parallel_configs] +PARALLEL_ARGS=( + --to_static 1 + --sharding_parallel_degree 2 + --sharding "stage3" + --tensor_parallel_degree 4 + --sequence_parallel true + --pipeline_parallel_degree 1 + --virtual_pp_degree 1 + --pipeline_schedule_mode "1F1B" + --sep_parallel_degree 1 + --pipeline_parallel_config "enable_send_recv_overlap" + --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" + --sharding_parallel_config "enable_overlap enable_release_grads" + --tensor_parallel_config "enable_mp_async_allreduce replace_with_parallel_cross_entropy" +) +# --sharding_parallel_config "enable_overlap enable_release_grads enable_tensor_fusion" + + +# [fused] [flash_attention] +DEFAULT_OPTIMIZER_ARGS=" + --fuse_attention_ffn true \ + --fuse_attention_qkv true \ + --fused_linear_param_grad_add 1 \ + --fuse_sequence_parallel_allreduce true \ + --use_flash_attention true \ + --use_fused_rope true \ + --use_fused_rms_norm false \ + --enable_linear_fused_grad_add true \ +" + + # --use_fast_layer_norm true \ +# [data] max_seq_length equal config.max_position_embeddings +DATA_ARGS=" + --input_dir ./data \ + --split 949,50,1 \ + --max_seq_length 1024" + +# [runtime_profile] +RUNTIME_PROFILE_ARGS=" + --profile_time_flag 1 \ + --profile_memory_flag 1 \ + --profile_forward_only 0 \ + --save_time_flag 0 \ + --save_memory_flag 0 \ +" + +# [debug] +DEBUG_ARGS=" + --job_schedule_profiler_start 1 \ + --job_schedule_profiler_end 5 \ +" + +$LAUNCHER \ + "${MODEL_ARGS[@]}" \ + $TRAIN_ARGS \ + $CONFIG_ARGS \ + "${PARALLEL_ARGS[@]}" \ + $DEFAULT_OPTIMIZER_ARGS \ + $DATA_ARGS \ + $RUNTIME_PROFILE_ARGS \ \ No newline at end of file 
diff --git a/llm/auto_parallel/galvatron/scripts/train_qwen_fine_graine.sh b/llm/auto_parallel/galvatron/scripts/train_qwen_fine_graine.sh new file mode 100644 index 000000000000..7f616c59783a --- /dev/null +++ b/llm/auto_parallel/galvatron/scripts/train_qwen_fine_graine.sh @@ -0,0 +1,185 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +unset CUDA_VISIBLE_DEVICES + +nnodes=$PADDLE_TRAINERS_NUM +rank=$PADDLE_TRAINER_ID + +unset PADDLE_ELASTIC_JOB_ID +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +unset FLAGS_START_PORT +unset PADDLE_ELASTIC_TIMEOUT +unset PADDLE_TRAINERS_NUM +unset PADDLE_TRAINER_ID +unset PADDLE_WORKERS_IP_PORT_LIST +unset PADDLE_TRAINERS +unset PADDLE_NUM_GRADIENT_SERVERS + +START_RANK=0 +END_RANK=8 + +if [[ $rank -lt $START_RANK ]]; then + exit 0 +fi + +if [[ $rank -ge $END_RANK ]]; then + exit 0 +fi +export rank=$(($rank-$START_RANK)) +export nnodes=$(($END_RANK-$START_RANK)) +master_ip=`cat /root/paddlejob/workspace/hostfile | head -n $(($START_RANK+1)) | tail -n 1 | awk '{print $1}'` +export master=$master_ip +export port=36677 + +export interpreter="" + +task_name="fine_grained_config-with-manual" +dir_name="fine-vs-corase" + +rm -rf output/$dir_name/$task_name/ +rm -rf "output/$dir_name/$task_name""_log" + +export SOT_LOG_LEVEL=4 +export PYTHONPATH=../../../:$PYTHONPATH + +TRAINER="./train_qwen_fine_graine.py" +LAUNCHER="python -u -m paddle.distributed.launch" +LAUNCHER="${LAUNCHER} --master $master:$port --nnodes $nnodes --rank $rank --gpus 0,1,2,3,4,5,6,7" +LAUNCHER="${LAUNCHER} --log_dir output/$dir_name/$task_name""_log ${TRAINER} --output_dir "./output"" + +# [max_steps] [logging_steps] [enable_auto_parallel] +TRAIN_ARGS=" + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --max_grad_norm 1.0 \ + --learning_rate 3e-05 \ + --min_learning_rate 3e-06 \ + --max_steps 25 \ + --logging_steps 1 \ + --continue_training 0 \ + --do_train true \ + --disable_tqdm true \ + --skip_profile_timer false \ + --skip_memory_metrics 0 \ + --save_total_limit 2 \ + --device gpu \ + --dataloader_num_workers 1 \ + --distributed_dataloader 0 \ + --enable_auto_parallel 1 \ +" + +# [seq_length] [num_hidden_layers] +# still need to use llama as model_type +MODEL_ARGS=( + --model_type "llama_fine_grained_final" + --num_hidden_layers 72 + --intermediate_size 25600 + --vocab_size 32000 + --hidden_size 5120 + --seq_length 32768 + --num_attention_heads 64 + --num_key_value_heads 8 +) + +# "max_position_embeddings": 32768, +# [mbsz, accumulation_steps] [recompute] [amp] +CONFIG_ARGS=" + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 32 \ + --recompute false \ + --recompute_use_reentrant true \ + --recompute_granularity full \ + --pp_recompute_interval 0 \ + --bf16 true \ + --fp16_opt_level "O2" \ + --amp_master_grad true \ + --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \ + --amp_custom_white_list "lookup_table" "lookup_table_v2" \ +" + +# [dp_deg, dp_type] 
[tp_deg, megatron-sp] [pp_deg, 1F1B] [parallel_configs] +PARALLEL_ARGS=( + --to_static 1 + --sharding_parallel_degree 2 + --sharding "stage2" + --tensor_parallel_degree 4 + --sequence_parallel true + --pipeline_parallel_degree 1 + --virtual_pp_degree 1 + --pipeline_schedule_mode "1F1B" + --sep_parallel_degree 1 + --pipeline_parallel_config "enable_send_recv_overlap" + --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" + --sharding_parallel_config "enable_overlap enable_release_grads" + --tensor_parallel_config "enable_mp_async_allreduce replace_with_parallel_cross_entropy" +) +# --sharding_parallel_config "enable_overlap enable_release_grads enable_tensor_fusion" + + +# [fused] [flash_attention] +DEFAULT_OPTIMIZER_ARGS=" + --fuse_attention_ffn true \ + --fuse_attention_qkv true \ + --fused_linear_param_grad_add 1 \ + --fuse_sequence_parallel_allreduce true \ + --use_flash_attention true \ + --use_fused_rope true \ + --use_fused_rms_norm true \ + --enable_linear_fused_grad_add true \ +" + +# [data] max_seq_length equal config.max_position_embeddings +DATA_ARGS=" + --input_dir ./data \ + --split 949,50,1 \ + --max_seq_length 32768" + +# [runtime_profile] +RUNTIME_PROFILE_ARGS=" + --profile_time_flag 1 \ + --profile_memory_flag 1 \ + --profile_forward_only 0 \ + --save_time_flag 0 \ + --save_memory_flag 0 \ +" + +# [debug] +DEBUG_ARGS=" + --job_schedule_profiler_start 1 \ + --job_schedule_profiler_end 5 \ +" + +# [GranularityRuntime] +GRANULARITY_RUNTIME_ARGS=" + --granularity_type fine_grained \ + --usp_flag 0 \ + --sharding_stage_level 2 \ + --fine_grained_config_path ./configs/fine_grained_config.json \ +" + +bash kill.sh +sleep 1 + +$LAUNCHER \ + "${MODEL_ARGS[@]}" \ + $TRAIN_ARGS \ + $CONFIG_ARGS \ + "${PARALLEL_ARGS[@]}" \ + $DEFAULT_OPTIMIZER_ARGS \ + $DATA_ARGS \ + $RUNTIME_PROFILE_ARGS \ + $GRANULARITY_RUNTIME_ARGS \ No newline at end of file
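
# A minimal sketch of the intended run order for the scripts in this patch, inferred
# from the config paths they produce and consume (cluster settings such as --ips,
# master/port and node counts are placeholders and must be adapted):
#
#   1. hardware/communication profiling: profile_hardware.sh, profile_overlap.sh, profile_p2p.sh
#   2. model profiling: the computation-profiling script above and profile_memory.sh
#   3. strategy search: search_dist.sh (reads the JSON profiles from ./configs)
#   4. training: train_qwen_fine_graine.sh with the searched fine_grained_config.json,
#      or train_qwen.sh as the coarse-grained baseline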