diff --git a/llm/auto_parallel/galvatron/scripts/profile_all2all.sh b/llm/auto_parallel/galvatron/scripts/profile_all2all.sh
new file mode 100644
index 000000000000..bb18754ef915
--- /dev/null
+++ b/llm/auto_parallel/galvatron/scripts/profile_all2all.sh
@@ -0,0 +1,26 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5
+export NCCL_IB_DISABLE=0
+
+# Profile all2all time for every tensor-parallel degree and local batch size (replace `xxx` with the node IPs).
+for tp_deg in 8 4 2; do
+    for local_batch_size in 1024 512 256 128 64 32 16 8 4 2 1; do
+        cmd="python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_all2all ./paddlenlp/experimental/galvatron/profiler/profile_all2all.py --output_dir ./output --tp_deg ${tp_deg} --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/sp_time_1nodes_8gpus_per_node.json --local_batch_size ${local_batch_size}"
+        echo "Running: ${cmd}"
+        ${cmd}
+        sleep 1
+    done
+done
diff --git a/llm/auto_parallel/galvatron/scripts/profile_allreduce.sh b/llm/auto_parallel/galvatron/scripts/profile_allreduce.sh
new file mode 100644
index 000000000000..d3774efd018e
--- /dev/null
+++ b/llm/auto_parallel/galvatron/scripts/profile_allreduce.sh
@@ -0,0 +1,26 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5
+export NCCL_IB_DISABLE=0
+echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir ./output --profile_time 0 --tp_deg 8 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json"
+python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 8 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
+sleep 1
+echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir ./output --profile_time 0 --tp_deg 4 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json"
+python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 4 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
+sleep 1
+echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir ./output --profile_time 0 --tp_deg 2 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json"
+python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 2 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
+sleep 1
+rm -r ./profiler_log
\ No newline at end of file
diff --git a/llm/auto_parallel/galvatron/scripts/profile_allreduce_sp.sh b/llm/auto_parallel/galvatron/scripts/profile_allreduce_sp.sh
new file mode 100644
index 000000000000..d5120c461f8a
--- /dev/null
+++ b/llm/auto_parallel/galvatron/scripts/profile_allreduce_sp.sh
@@ -0,0 +1,27 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5
+export NCCL_IB_DISABLE=0
+
+# Profile allreduce time with sequence parallelism for every tensor-parallel degree and local batch size.
+for tp_deg in 8 4 2; do
+    for local_batch_size in 1024 512 256 128 64 32 16 8 4 2 1; do
+        cmd="python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_allreduce_sp ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir ./output --profile_time 1 --tp_deg ${tp_deg} --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/sp_time_1nodes_8gpus_per_node.json --local_batch_size ${local_batch_size}"
+        echo "Running: ${cmd}"
+        ${cmd}
+        sleep 1
+    done
+done
+rm -r ./profiler_log
\ No newline at end of file
diff --git a/llm/auto_parallel/galvatron/scripts/profile_computation.sh b/llm/auto_parallel/galvatron/scripts/profile_computation.sh
new file mode 100644
index 000000000000..a0e291ab9ee8
--- /dev/null
+++ b/llm/auto_parallel/galvatron/scripts/profile_computation.sh
@@ -0,0 +1,160 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+unset CUDA_VISIBLE_DEVICES
+
+unset PADDLE_ELASTIC_JOB_ID
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+unset FLAGS_START_PORT
+unset PADDLE_ELASTIC_TIMEOUT
+unset PADDLE_TRAINERS_NUM
+unset PADDLE_TRAINER_ID
+unset PADDLE_WORKERS_IP_PORT_LIST
+unset PADDLE_TRAINERS
+unset PADDLE_NUM_GRADIENT_SERVERS
+
+source
+
+task_name="qwen"
+dir_name="profile_computation"
+rm -rf output/$dir_name/$task_name/
+rm -rf "output/$dir_name/$task_name""_log"
+
+export SOT_LOG_LEVEL=4
+export PYTHONPATH=../../../:$PYTHONPATH
+
+TRAINER="./train_qwen.py"
+LAUNCHER="python -u -m paddle.distributed.launch"
+LAUNCHER="${LAUNCHER} --gpus 7" # Set which GPUs to use
+LAUNCHER="${LAUNCHER} --log_dir output/$dir_name/$task_name""_log ${TRAINER} --output_dir "./output""
+
+export LAUNCHER=$LAUNCHER
+
+# [max_steps] [logging_steps] [enable_auto_parallel]
+TRAIN_ARGS="
+    --weight_decay 0.01 \
+    --warmup_ratio 0.01 \
+    --max_grad_norm 1.0 \
+    --learning_rate 3e-05 \
+    --min_learning_rate 3e-06 \
+    --max_steps 25 \
+    --logging_steps 1 \
+    --continue_training 0 \
+    --do_train true \
+    --do_eval false \
+    --do_predict false \
+    --disable_tqdm true \
+    --skip_profile_timer false \
+    --skip_memory_metrics 0 \
+    --save_total_limit 2 \
+    --device gpu \
+    --dataloader_num_workers 1 \
+    --distributed_dataloader 0 \
+    --enable_auto_parallel 1 \
+"
+
+# [seq_length] [num_hidden_layers]
+MODEL_ARGS="
+    --model_name_or_path "llama" \
+    --tokenizer_name_or_path "llama" \
+    --num_hidden_layers 2 \
+    --intermediate_size 25600 \
+    --vocab_size 32000 \
+    --hidden_size 5120 \
+    --seq_length 1024 \
+    --num_attention_heads 64 \
+    --num_key_value_heads 8 \
+"
+
+# [mbsz, accumulation_steps] [recompute] [amp]
+CONFIG_ARGS="
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 4 \
+    --recompute true \
+    --recompute_use_reentrant true \
+    --recompute_granularity full \
+    --pp_recompute_interval 0 \
+    --bf16 true \
+    --fp16_opt_level "O2" \
+    --amp_master_grad true \
+    --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+    --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+"
+
+# [dp_deg, dp_type] [tp_deg, megatron-sp] [pp_deg, 1F1B] [parallel_configs]
+PARALLEL_ARGS=(
+    --to_static 0
+    --sharding_parallel_degree 1
+    --sharding "stage2"
+    --tensor_parallel_degree 2
+    --sequence_parallel true
+    --pipeline_parallel_degree 2
+    --virtual_pp_degree 1
+    --pipeline_schedule_mode "1F1B"
+    --sep_parallel_degree 1
+    --pipeline_parallel_config "enable_send_recv_overlap"
+    --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate"
+    --sharding_parallel_config "enable_overlap enable_release_grads"
+    --tensor_parallel_config "enable_mp_async_allreduce replace_with_parallel_cross_entropy"
+)
+
+# [fused] [flash_attention]
+DEFAULT_OPTIMIZER="
+    --fuse_attention_ffn true \
+    --fuse_attention_qkv true \
+    --fused_linear_param_grad_add 1 \
+    --fuse_sequence_parallel_allreduce true \
+    --use_flash_attention true \
+    --use_fused_rope true \
+    --use_fused_rms_norm false \
+    --enable_linear_fused_grad_add true \
+"
+
+# [data]
+DATA_ARGS="
+    --input_dir ./data \
+    --split 949,50,1 \
+    --max_seq_length 16384"
+
+# [runtime profiler]
+RUNTIME_PROFILE_ARGS="
+    --profile_time_flag 1 \
+    --profile_forward_only 1 \
+    --save_time_flag 1 \
+"
+
+# [model profiler] [sequence type]
+MODEL_PROFILER_ARGS="
+    --profile_type computation \
+    --profile_mode sequence \
+    --profile_fixed_batch_size 1 \
+    --layernum_min 1 \
+    --layernum_max 2 \
+    --profile_min_seq_length 4096 \
+    --profile_max_seq_length 16384 \
+    --profile_seq_length_step 4096 \
+    --num_layertype 1 \
+"
+
+python ./profile.py \
+    $MODEL_ARGS \
+    $TRAIN_ARGS \
+    $CONFIG_ARGS \
+    "${PARALLEL_ARGS[@]}" \
+    $DEFAULT_OPTIMIZER \
+    $DATA_ARGS \
+    $RUNTIME_PROFILE_ARGS \
+    $MODEL_PROFILER_ARGS
\ No newline at end of file
diff --git a/llm/auto_parallel/galvatron/scripts/profile_hardware.sh b/llm/auto_parallel/galvatron/scripts/profile_hardware.sh
new file mode 100644
index 000000000000..865f7c812adb
--- /dev/null
+++ b/llm/auto_parallel/galvatron/scripts/profile_hardware.sh
@@ -0,0 +1,32 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+launch="${interpreter} -u -m paddle.distributed.launch"
+launch="${launch} --master $master:$port --nnodes $nnodes --rank $rank --gpus 0,1,2,3,4,5,6,7"
+
+export INTERPRETER=${interpreter}
+export LAUNCHER=${launch}
+export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5
+export NCCL_IB_DISABLE=0
+
+PROFILE_HARDWARE_ARGS=(
+    --num_nodes $nnodes
+    --num_gpus_per_node 8
+    --backend 'paddle'
+    --max_pp_deg 8
+    --max_tp_deg 8
+)
+
+${interpreter} profile_hardware.py \
+    "${PROFILE_HARDWARE_ARGS[@]}"
\ No newline at end of file
diff --git a/llm/auto_parallel/galvatron/scripts/profile_memory.sh b/llm/auto_parallel/galvatron/scripts/profile_memory.sh
new file mode 100644
index 000000000000..2aa615f7eb9c
--- /dev/null
+++ b/llm/auto_parallel/galvatron/scripts/profile_memory.sh
@@ -0,0 +1,161 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +set -x +unset CUDA_VISIBLE_DEVICES + +unset PADDLE_ELASTIC_JOB_ID +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +unset FLAGS_START_PORT +unset PADDLE_ELASTIC_TIMEOUT +unset PADDLE_TRAINERS_NUM +unset PADDLE_TRAINER_ID +unset PADDLE_WORKERS_IP_PORT_LIST +unset PADDLE_TRAINERS +unset PADDLE_NUM_GRADIENT_SERVERS + +source + +task_name="qwen_profile_memory" +dir_name="profile_memory" +rm -rf output/$dir_name/$task_name/ +rm -rf "output/$dir_name/$task_name""_log" + +export SOT_LOG_LEVEL=4 +export PYTHONPATH=../../../:$PYTHONPATH + +TRAINER="./train_qwen.py" +LAUNCHER="python -u -m paddle.distributed.launch" +LAUNCHER="${LAUNCHER} --gpus 0,1,2,3,4,5,6,7" # set the GPUs to use +LAUNCHER="${LAUNCHER} --log_dir output/$dir_name/$task_name""_log ${TRAINER} --output_dir "./output"" + +export LAUNCHER=$LAUNCHER +export PROFILE_WORLD_SIZE=8 + +# [max_steps] [logging_steps] [enable_auto_parallel] +TRAIN_ARGS=" + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --max_grad_norm 1.0 \ + --learning_rate 3e-05 \ + --min_learning_rate 3e-06 \ + --max_steps 10 \ + --logging_steps 1 \ + --continue_training 0 \ + --do_train true \ + --do_eval false \ + --do_predict false \ + --disable_tqdm true \ + --skip_profile_timer false \ + --skip_memory_metrics 0 \ + --save_total_limit 2 \ + --device gpu \ + --dataloader_num_workers 1 \ + --distributed_dataloader 0 \ + --enable_auto_parallel 1 \ +" + +# [seq_length] [num_hidden_layers] +MODEL_ARGS=" + --model_name_or_path "llama" \ + --tokenizer_name_or_path "llama" \ + --num_hidden_layers 2 \ + --intermediate_size 25600 \ + --vocab_size 32000 \ + --hidden_size 5120 \ + --seq_length 1024 \ + --num_attention_heads 64 \ + --num_key_value_heads 8 \ +" + +# [mbsz, accumulation_steps] [recompute] [amp] +CONFIG_ARGS=" + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 1 \ + --recompute false \ + --recompute_use_reentrant true \ + --recompute_granularity full \ + --pp_recompute_interval 0 \ + --bf16 true \ + --fp16_opt_level "O2" \ + --amp_master_grad true \ + --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \ + --amp_custom_white_list "lookup_table" "lookup_table_v2" \ +" + +# [dp_deg, dp_type] [tp_deg, megatron-sp] [pp_deg, 1F1B] [parallel_configs] +PARALLEL_ARGS=( + --to_static 0 + --sharding_parallel_degree 1 + --sharding "stage2" + --tensor_parallel_degree 2 + --sequence_parallel true + --pipeline_parallel_degree 2 + --virtual_pp_degree 1 + --pipeline_schedule_mode "1F1B" + --sep_parallel_degree 1 + --pipeline_parallel_config "enable_send_recv_overlap" + --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" + --sharding_parallel_config "enable_overlap" + --tensor_parallel_config "enable_mp_async_allreduce" +) + +# [fused] [flash_attention] +DEFAULT_OPTIMIZER_ARGS=" + --fuse_attention_ffn true \ + --fuse_attention_qkv true \ + --fused_linear_param_grad_add 1 \ + --fuse_sequence_parallel_allreduce true \ + --use_flash_attention true \ + --use_fused_rope true \ + --use_fused_rms_norm true \ + --enable_linear_fused_grad_add true \ +" + +# [data] +DATA_ARGS=" + --input_dir ./data \ + --split 949,50,1 \ + --max_seq_length 16384" + +# [runtime profiler] +RUNTIME_PROFILE_ARGS=" + --profile_memory_flag 1 \ + --save_memory_flag 1 \ +" + +# [model profiler] [static type] +MODEL_PROFILER_ARGS=" + --profile_type memory \ + --profile_mode static \ + --profile_fixed_batch_size 8 \ + --layernum_min 1 \ + --layernum_max 2 \ + --profile_fixed_seq_length_list 16384 \ + --num_layertype 1 \ + 
--max_tp_deg 8 \ + --max_per_device_train_batch_size 4 \ +" + +python ./profile.py \ + $MODEL_ARGS \ + $TRAIN_ARGS \ + $CONFIG_ARGS \ + "${PARALLEL_ARGS[@]}" \ + $DEFAULT_OPTIMIZER_ARGS \ + $DATA_ARGS \ + $RUNTIME_PROFILE_ARGS \ + $MODEL_PROFILER_ARGS + \ No newline at end of file diff --git a/llm/auto_parallel/galvatron/scripts/profile_overlap.sh b/llm/auto_parallel/galvatron/scripts/profile_overlap.sh new file mode 100644 index 000000000000..4acc2133bece --- /dev/null +++ b/llm/auto_parallel/galvatron/scripts/profile_overlap.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5 +export NCCL_IB_DISABLE=0 +echo "Running python3 -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 --log_dir output/profile_overlap ./paddlenlp/experimental/galvatron/profiler/profile_overlap.py --output_dir "./output"" +python3 -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 --log_dir output/profile_overlap ./paddlenlp/experimental/galvatron/profiler/profile_overlap.py --output_dir "./output" +sleep 1 +rm -r ./profiler_log \ No newline at end of file diff --git a/llm/auto_parallel/galvatron/scripts/profile_p2p.sh b/llm/auto_parallel/galvatron/scripts/profile_p2p.sh new file mode 100644 index 000000000000..b8fa0c79f062 --- /dev/null +++ b/llm/auto_parallel/galvatron/scripts/profile_p2p.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
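+
+# profile_p2p.sh: measures point-to-point (pipeline send/recv) bandwidth for
+# pp_deg 2, 4 and 8 and writes the results to a p2p_bandwidth_*nodes_*gpus_per_node.json
+# config. Replace the "--ips xxx" placeholder below with your actual node IP list
+# before running; a file of this form is presumably what search_dist.sh later
+# consumes via --p2p_coe_path.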
+ +export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5 +export NCCL_IB_DISABLE=0 +echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_p2p ./paddlenlp/experimental/galvatron/profiler/profile_p2p.py --output_dir "./output" --pp_deg 2 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/p2p_bandwidth_1nodes_8gpus_per_node.json " +python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_p2p ./paddlenlp/experimental/galvatron/profiler/profile_p2p.py --output_dir "./output" --pp_deg 2 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/p2p_bandwidth_1nodes_8gpus_per_node.json +sleep 1 +echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_p2p ./paddlenlp/experimental/galvatron/profiler/profile_p2p.py --output_dir "./output" --pp_deg 4 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/p2p_bandwidth_1nodes_8gpus_per_node.json " +python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_p2p ./paddlenlp/experimental/galvatron/profiler/profile_p2p.py --output_dir "./output" --pp_deg 4 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/p2p_bandwidth_1nodes_8gpus_per_node.json +sleep 1 +echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_p2p ./paddlenlp/experimental/galvatron/profiler/profile_p2p.py --output_dir "./output" --pp_deg 8 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/p2p_bandwidth_1nodes_8gpus_per_node.json " +python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir ./output/profile_p2p ./paddlenlp/experimental/galvatron/profiler/profile_p2p.py --output_dir "./output" --pp_deg 8 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/p2p_bandwidth_1nodes_8gpus_per_node.json +sleep 1 +rm -r ./profiler_log \ No newline at end of file diff --git a/llm/auto_parallel/galvatron/scripts/search_dist.sh b/llm/auto_parallel/galvatron/scripts/search_dist.sh new file mode 100644 index 000000000000..80d75b2aa228 --- /dev/null +++ b/llm/auto_parallel/galvatron/scripts/search_dist.sh @@ -0,0 +1,66 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
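+
+# search_dist.sh: runs the Galvatron strategy search on top of the profiling
+# results. ProfileDataParserArgs points at the JSON files produced by the
+# profiling scripts (computation time, memory, overlap coefficient, allreduce/p2p
+# bandwidth, sequence-parallel time); SearchEngineArgs bounds the search space
+# (world size, batch size range, max TP/PP degree, memory limit). The paths below
+# assume an 8-node x 8-GPUs-per-node cluster and must match your own profiling output.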
+ +set -x + +unset PADDLE_ELASTIC_JOB_ID +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +unset FLAGS_START_PORT +unset PADDLE_ELASTIC_TIMEOUT +unset PADDLE_TRAINERS_NUM +unset PADDLE_TRAINER_ID +unset PADDLE_WORKERS_IP_PORT_LIST +unset PADDLE_TRAINERS +unset PADDLE_NUM_GRADIENT_SERVERS + +source + +export PYTHONPATH=../../..:$PYTHONPATH + +ProfileDataParserArgs=" + --time_profile_mode sequence \ + --memory_profile_mode static \ + --num_layertype 1 \ + --hidden_size_list 5120 \ + --layernum_list 72 \ + --seqlen_list 32768 \ + --profile_gpu_num 64 \ + --time_profile_data_path ./configs/computation_profiling_bf16_llama_rank[0].json \ + --memory_profile_data_path ./configs/memory_profiling_bf16_llama.json \ + --overlap_coe_path ./configs/overlap_coefficient.json \ + --allreduce_coe_path ./configs/allreduce_bandwidth_8nodes_8gpus_per_node.json \ + --p2p_coe_path ./configs/p2p_bandwidth_8nodes_8gpus_per_node.json \ + --sp_time_path ./configs/sp_time_8nodes_8gpus_per_node.json \ +" + +SearchEngineArgs=" + --search_granularity fine-grained \ + --world_size 64 \ + --min_bsz 64 \ + --max_bsz 64 \ + --bsz_step 1 \ + --max_tp_size 8 \ + --max_pp_size 8 \ + --mixed_precision_type bf16 \ + --memory_upper_limit 95 \ + --sp_space tp \ + --layernum 72 \ + --disable_sdp 0 \ + --disable_vtp 0 \ + --parallel_search 0 \ + --log_dir ./search-engine-logs \ +" + +python ./search_dist.py ${ProfileDataParserArgs} ${SearchEngineArgs} \ No newline at end of file diff --git a/llm/auto_parallel/galvatron/scripts/train_qwen.sh b/llm/auto_parallel/galvatron/scripts/train_qwen.sh new file mode 100644 index 000000000000..de2e20522a67 --- /dev/null +++ b/llm/auto_parallel/galvatron/scripts/train_qwen.sh @@ -0,0 +1,143 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
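+
+# train_qwen.sh: coarse-grained baseline run that applies one hand-written parallel
+# strategy to the whole model (sharding stage3 degree 2, TP 4, PP 1, sequence
+# parallel enabled). Compare with train_qwen_fine_graine.sh, which trains with a
+# per-layer fine-grained configuration read from ./configs/fine_grained_config.json.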
+ +set -x +unset CUDA_VISIBLE_DEVICES + +task_name="corase_grained" +dir_name="fine-vs-corase" + +export HF_ENDPOINT=https://hf-mirror.com +rm -rf output/$dir_name/$task_name/ +rm -rf "output/$dir_name/$task_name""_log" + +export SOT_LOG_LEVEL=4 +export PYTHONPATH=../../../:$PYTHONPATH + +TRAINER="./train_qwen.py" +LAUNCHER="python -u -m paddle.distributed.launch --log_level DEBUG" +LAUNCHER="${LAUNCHER} --gpus 0,1,2,3,4,5,6,7" +LAUNCHER="${LAUNCHER} --log_dir output/$dir_name/$task_name""_log ${TRAINER} --output_dir "./output"" + +# [max_steps] [logging_steps] [enable_auto_parallel] +TRAIN_ARGS=" + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --max_grad_norm 1.0 \ + --learning_rate 3e-05 \ + --min_learning_rate 3e-06 \ + --max_steps 25 \ + --logging_steps 1 \ + --continue_training 0 \ + --do_train true \ + --disable_tqdm true \ + --skip_profile_timer false \ + --skip_memory_metrics 0 \ + --save_total_limit 2 \ + --device gpu \ + --dataloader_num_workers 1 \ + --distributed_dataloader 0 \ + --enable_auto_parallel 1 \ +" + +# [seq_length] [num_hidden_layers] +# still need to use llama as model_type +MODEL_ARGS=( + --model_type "llama" + --num_hidden_layers 16 + --intermediate_size 11008 + --vocab_size 32000 + --hidden_size 4096 + --seq_length 1024 + --num_attention_heads 32 + --num_key_value_heads 32 +) + +# "max_position_embeddings": 32768, +# [mbsz, accumulation_steps] [recompute] [amp] +CONFIG_ARGS=" + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 32 \ + --recompute false \ + --recompute_use_reentrant true \ + --recompute_granularity full \ + --pp_recompute_interval 0 \ + --bf16 true \ + --fp16_opt_level "O2" \ + --amp_master_grad true \ + --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \ + --amp_custom_white_list "lookup_table" "lookup_table_v2" \ +" + +# [dp_deg, dp_type] [tp_deg, megatron-sp] [pp_deg, 1F1B] [parallel_configs] +PARALLEL_ARGS=( + --to_static 1 + --sharding_parallel_degree 2 + --sharding "stage3" + --tensor_parallel_degree 4 + --sequence_parallel true + --pipeline_parallel_degree 1 + --virtual_pp_degree 1 + --pipeline_schedule_mode "1F1B" + --sep_parallel_degree 1 + --pipeline_parallel_config "enable_send_recv_overlap" + --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" + --sharding_parallel_config "enable_overlap enable_release_grads" + --tensor_parallel_config "enable_mp_async_allreduce replace_with_parallel_cross_entropy" +) +# --sharding_parallel_config "enable_overlap enable_release_grads enable_tensor_fusion" + + +# [fused] [flash_attention] +DEFAULT_OPTIMIZER_ARGS=" + --fuse_attention_ffn true \ + --fuse_attention_qkv true \ + --fused_linear_param_grad_add 1 \ + --fuse_sequence_parallel_allreduce true \ + --use_flash_attention true \ + --use_fused_rope true \ + --use_fused_rms_norm false \ + --enable_linear_fused_grad_add true \ +" + + # --use_fast_layer_norm true \ +# [data] max_seq_length equal config.max_position_embeddings +DATA_ARGS=" + --input_dir ./data \ + --split 949,50,1 \ + --max_seq_length 1024" + +# [runtime_profile] +RUNTIME_PROFILE_ARGS=" + --profile_time_flag 1 \ + --profile_memory_flag 1 \ + --profile_forward_only 0 \ + --save_time_flag 0 \ + --save_memory_flag 0 \ +" + +# [debug] +DEBUG_ARGS=" + --job_schedule_profiler_start 1 \ + --job_schedule_profiler_end 5 \ +" + +$LAUNCHER \ + "${MODEL_ARGS[@]}" \ + $TRAIN_ARGS \ + $CONFIG_ARGS \ + "${PARALLEL_ARGS[@]}" \ + $DEFAULT_OPTIMIZER_ARGS \ + $DATA_ARGS \ + $RUNTIME_PROFILE_ARGS \ \ No newline at end of file 
diff --git a/llm/auto_parallel/galvatron/scripts/train_qwen_fine_graine.sh b/llm/auto_parallel/galvatron/scripts/train_qwen_fine_graine.sh new file mode 100644 index 000000000000..7f616c59783a --- /dev/null +++ b/llm/auto_parallel/galvatron/scripts/train_qwen_fine_graine.sh @@ -0,0 +1,185 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +unset CUDA_VISIBLE_DEVICES + +nnodes=$PADDLE_TRAINERS_NUM +rank=$PADDLE_TRAINER_ID + +unset PADDLE_ELASTIC_JOB_ID +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +unset FLAGS_START_PORT +unset PADDLE_ELASTIC_TIMEOUT +unset PADDLE_TRAINERS_NUM +unset PADDLE_TRAINER_ID +unset PADDLE_WORKERS_IP_PORT_LIST +unset PADDLE_TRAINERS +unset PADDLE_NUM_GRADIENT_SERVERS + +START_RANK=0 +END_RANK=8 + +if [[ $rank -lt $START_RANK ]]; then + exit 0 +fi + +if [[ $rank -ge $END_RANK ]]; then + exit 0 +fi +export rank=$(($rank-$START_RANK)) +export nnodes=$(($END_RANK-$START_RANK)) +master_ip=`cat /root/paddlejob/workspace/hostfile | head -n $(($START_RANK+1)) | tail -n 1 | awk '{print $1}'` +export master=$master_ip +export port=36677 + +export interpreter="" + +task_name="fine_grained_config-with-manual" +dir_name="fine-vs-corase" + +rm -rf output/$dir_name/$task_name/ +rm -rf "output/$dir_name/$task_name""_log" + +export SOT_LOG_LEVEL=4 +export PYTHONPATH=../../../:$PYTHONPATH + +TRAINER="./train_qwen_fine_graine.py" +LAUNCHER="python -u -m paddle.distributed.launch" +LAUNCHER="${LAUNCHER} --master $master:$port --nnodes $nnodes --rank $rank --gpus 0,1,2,3,4,5,6,7" +LAUNCHER="${LAUNCHER} --log_dir output/$dir_name/$task_name""_log ${TRAINER} --output_dir "./output"" + +# [max_steps] [logging_steps] [enable_auto_parallel] +TRAIN_ARGS=" + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --max_grad_norm 1.0 \ + --learning_rate 3e-05 \ + --min_learning_rate 3e-06 \ + --max_steps 25 \ + --logging_steps 1 \ + --continue_training 0 \ + --do_train true \ + --disable_tqdm true \ + --skip_profile_timer false \ + --skip_memory_metrics 0 \ + --save_total_limit 2 \ + --device gpu \ + --dataloader_num_workers 1 \ + --distributed_dataloader 0 \ + --enable_auto_parallel 1 \ +" + +# [seq_length] [num_hidden_layers] +# still need to use llama as model_type +MODEL_ARGS=( + --model_type "llama_fine_grained_final" + --num_hidden_layers 72 + --intermediate_size 25600 + --vocab_size 32000 + --hidden_size 5120 + --seq_length 32768 + --num_attention_heads 64 + --num_key_value_heads 8 +) + +# "max_position_embeddings": 32768, +# [mbsz, accumulation_steps] [recompute] [amp] +CONFIG_ARGS=" + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 32 \ + --recompute false \ + --recompute_use_reentrant true \ + --recompute_granularity full \ + --pp_recompute_interval 0 \ + --bf16 true \ + --fp16_opt_level "O2" \ + --amp_master_grad true \ + --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \ + --amp_custom_white_list "lookup_table" "lookup_table_v2" \ +" + +# [dp_deg, dp_type] 
[tp_deg, megatron-sp] [pp_deg, 1F1B] [parallel_configs] +PARALLEL_ARGS=( + --to_static 1 + --sharding_parallel_degree 2 + --sharding "stage2" + --tensor_parallel_degree 4 + --sequence_parallel true + --pipeline_parallel_degree 1 + --virtual_pp_degree 1 + --pipeline_schedule_mode "1F1B" + --sep_parallel_degree 1 + --pipeline_parallel_config "enable_send_recv_overlap" + --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" + --sharding_parallel_config "enable_overlap enable_release_grads" + --tensor_parallel_config "enable_mp_async_allreduce replace_with_parallel_cross_entropy" +) +# --sharding_parallel_config "enable_overlap enable_release_grads enable_tensor_fusion" + + +# [fused] [flash_attention] +DEFAULT_OPTIMIZER_ARGS=" + --fuse_attention_ffn true \ + --fuse_attention_qkv true \ + --fused_linear_param_grad_add 1 \ + --fuse_sequence_parallel_allreduce true \ + --use_flash_attention true \ + --use_fused_rope true \ + --use_fused_rms_norm true \ + --enable_linear_fused_grad_add true \ +" + +# [data] max_seq_length equal config.max_position_embeddings +DATA_ARGS=" + --input_dir ./data \ + --split 949,50,1 \ + --max_seq_length 32768" + +# [runtime_profile] +RUNTIME_PROFILE_ARGS=" + --profile_time_flag 1 \ + --profile_memory_flag 1 \ + --profile_forward_only 0 \ + --save_time_flag 0 \ + --save_memory_flag 0 \ +" + +# [debug] +DEBUG_ARGS=" + --job_schedule_profiler_start 1 \ + --job_schedule_profiler_end 5 \ +" + +# [GranularityRuntime] +GRANULARITY_RUNTIME_ARGS=" + --granularity_type fine_grained \ + --usp_flag 0 \ + --sharding_stage_level 2 \ + --fine_grained_config_path ./configs/fine_grained_config.json \ +" + +bash kill.sh +sleep 1 + +$LAUNCHER \ + "${MODEL_ARGS[@]}" \ + $TRAIN_ARGS \ + $CONFIG_ARGS \ + "${PARALLEL_ARGS[@]}" \ + $DEFAULT_OPTIMIZER_ARGS \ + $DATA_ARGS \ + $RUNTIME_PROFILE_ARGS \ + $GRANULARITY_RUNTIME_ARGS \ No newline at end of file
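
# A minimal sketch of the intended run order for the scripts in this patch, inferred
# from the config paths they produce and consume (cluster settings such as --ips,
# master/port and node counts are placeholders and must be adapted):
#
#   1. hardware/communication profiling: profile_hardware.sh, profile_overlap.sh, profile_p2p.sh
#   2. model profiling: the computation-profiling script above and profile_memory.sh
#   3. strategy search: search_dist.sh (reads the JSON profiles from ./configs)
#   4. training: train_qwen_fine_graine.sh with the searched fine_grained_config.json,
#      or train_qwen.sh as the coarse-grained baseline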