Skip to content

Commit fbe8982

Browse files
[Cherry-Pick] Fix timeouts of distributed test cases under CUDA 12. (#54635)
1 parent bb4f777 commit fbe8982

File tree

9 files changed, with 34 additions and 22 deletions.

test/auto_parallel/CMakeLists.txt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
2424
py_test_modules(test_optimization_tuner_api MODULES
2525
test_optimization_tuner_api)
2626
set_tests_properties(test_optimization_tuner_api
27-
PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80)
27+
PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120)
2828
py_test_modules(test_converter MODULES test_converter)
2929
set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
3030
TIMEOUT 50)
@@ -48,10 +48,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
4848
PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
4949
py_test_modules(test_pass_sharding MODULES test_pass_sharding)
5050
set_tests_properties(test_pass_sharding
51-
PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
51+
PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
5252
py_test_modules(test_pass_amp MODULES test_pass_amp)
5353
set_tests_properties(test_pass_amp PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
54-
TIMEOUT 50)
54+
TIMEOUT 80)
5555
py_test_modules(test_amp_o2_pass MODULES test_amp_o2_pass)
5656
set_tests_properties(test_amp_o2_pass PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
5757
TIMEOUT 50)
@@ -85,11 +85,11 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
8585
py_test_modules(test_tuning_recompute MODULES test_tuning_recompute)
8686
set_tests_properties(test_tuning_recompute PROPERTIES TIMEOUT 300)
8787
py_test_modules(test_fused_linear_pass MODULES test_fused_linear_pass)
88-
set_tests_properties(test_fused_linear_pass PROPERTIES TIMEOUT 20)
88+
set_tests_properties(test_fused_linear_pass PROPERTIES TIMEOUT 40)
8989
py_test_modules(test_align_tool MODULES test_align_tool)
9090
set_tests_properties(test_align_tool PROPERTIES TIMEOUT 20)
9191
py_test_modules(test_pass_base_list MODULES test_pass_base_list)
92-
set_tests_properties(test_pass_base_list PROPERTIES TIMEOUT 20)
92+
set_tests_properties(test_pass_base_list PROPERTIES TIMEOUT 40)
9393
py_test_modules(test_fuse_adamw_pass MODULES test_fuse_adamw_pass)
9494
set_tests_properties(test_fuse_adamw_pass PROPERTIES TIMEOUT 20)
9595
py_test_modules(test_rule_based_tuner_o2 MODULES test_rule_based_tuner_o2)

test/collective/CMakeLists.txt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
107107
test_collective_alltoall_single_api ENVS
108108
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
109109
set_tests_properties(test_collective_alltoall_single_api
110-
PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST")
110+
PROPERTIES TIMEOUT "160" LABELS "RUN_TYPE=DIST")
111111
endif()
112112
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
113113
py_test_modules(
@@ -137,10 +137,10 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
137137
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
138138
if(${CUDA_ARCH_NAME} STREQUAL "Ampere")
139139
set_tests_properties(test_collective_broadcast_api
140-
PROPERTIES TIMEOUT "360" LABELS "RUN_TYPE=DIST")
140+
PROPERTIES TIMEOUT "500" LABELS "RUN_TYPE=DIST")
141141
else()
142142
set_tests_properties(test_collective_broadcast_api
143-
PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST")
143+
PROPERTIES TIMEOUT "450" LABELS "RUN_TYPE=DIST")
144144
endif()
145145
endif()
146146
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
@@ -178,7 +178,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
178178
test_collective_isend_irecv_api MODULES test_collective_isend_irecv_api
179179
ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
180180
set_tests_properties(test_collective_isend_irecv_api
181-
PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST")
181+
PROPERTIES TIMEOUT "160" LABELS "RUN_TYPE=DIST")
182182
endif()
183183
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
184184
py_test_modules(
@@ -240,10 +240,10 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
240240
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
241241
if(${CUDA_ARCH_NAME} STREQUAL "Ampere")
242242
set_tests_properties(test_collective_reduce_scatter_api
243-
PROPERTIES TIMEOUT "210" LABELS "RUN_TYPE=DIST")
243+
PROPERTIES TIMEOUT "360" LABELS "RUN_TYPE=DIST")
244244
else()
245245
set_tests_properties(test_collective_reduce_scatter_api
246-
PROPERTIES TIMEOUT "150" LABELS "RUN_TYPE=DIST")
246+
PROPERTIES TIMEOUT "250" LABELS "RUN_TYPE=DIST")
247247
endif()
248248
endif()
249249
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
@@ -258,7 +258,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
258258
test_collective_scatter_api MODULES test_collective_scatter_api ENVS
259259
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
260260
set_tests_properties(test_collective_scatter_api
261-
PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST")
261+
PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST")
262262
endif()
263263
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
264264
py_test_modules(

test/collective/fleet/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
237237
START_BASH
238238
../../legacy_test/dist_test.sh
239239
TIMEOUT
240-
"120"
240+
"160"
241241
LABELS
242242
"RUN_TYPE=DIST"
243243
ENVS
@@ -682,13 +682,13 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
682682
START_BASH
683683
../../legacy_test/dist_test.sh
684684
TIMEOUT
685-
"120"
685+
"240"
686686
LABELS
687687
"RUN_TYPE=DIST"
688688
ENVS
689689
"PADDLE_DIST_UT_PORT=21272;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
690690
)
691-
set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT "120")
691+
set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT "240")
692692
endif()
693693
if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
694694
bash_test_modules(
@@ -922,7 +922,7 @@ if((WITH_GPU) AND (LINUX))
922922
test_dygraph_dist_save_load MODULES test_dygraph_dist_save_load ENVS
923923
"http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
924924
set_tests_properties(test_dygraph_dist_save_load
925-
PROPERTIES TIMEOUT "200" LABELS "RUN_TYPE=DIST")
925+
PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST")
926926
endif()
927927
if((WITH_GPU) AND (LINUX))
928928
py_test_modules(

test/collective/fleet/dygraph_group_sharded_stage3.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,10 @@ def test_stage2_stage3():
366366
# bfp16
367367
nccl_version = core.nccl_version()
368368

369-
if nccl_version >= 21000:
369+
if (
370+
nccl_version >= 21000
371+
and paddle.device.cuda.get_device_properties().major >= 8
372+
):
370373
stage2_params = train_mlp(
371374
mlp11,
372375
sharding_stage=2,

test/collective/fleet/dygraph_group_sharded_stage3_offload.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,10 @@ def test_stage3_offload():
215215

216216
# bfp16 offload
217217
nccl_version = core.nccl_version()
218-
if nccl_version >= 21000:
218+
if (
219+
nccl_version >= 21000
220+
and paddle.device.cuda.get_device_properties().major >= 8
221+
):
219222
stage3_params = train_mlp(mlp7, use_pure_fp16=True, use_bfp16=True)
220223
stage3_params_offload = train_mlp(
221224
mlp8, use_pure_fp16=True, offload=True, use_bfp16=True

test/collective/fleet/hybrid_parallel_mp_bf16.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,5 +60,8 @@ def train_batch(self, batch, model, optimizer, is_mp):
6060

6161

6262
if __name__ == "__main__":
63-
if check_nccl_version_for_bf16():
63+
if (
64+
check_nccl_version_for_bf16()
65+
and paddle.device.cuda.get_device_properties().major >= 8
66+
):
6467
unittest.main()

test/collective/fleet/hybrid_parallel_pp_bf16.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,5 +165,8 @@ def test_pp_model(self):
165165

166166

167167
if __name__ == "__main__":
168-
if check_nccl_version_for_bf16():
168+
if (
169+
check_nccl_version_for_bf16()
170+
and paddle.device.cuda.get_device_properties().major >= 8
171+
):
169172
unittest.main()

test/distributed_passes/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,6 @@ endif()
2929
foreach(TEST_OP ${TEST_OPS})
3030
py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS "NVIDIA_TF32_OVERRIDE=0")
3131
list(APPEND DIST_TEST_OPS ${TEST_OP})
32-
set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200)
32+
set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 250)
3333
set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST")
3434
endforeach()

test/legacy_test/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1304,4 +1304,4 @@ set_tests_properties(test_reduce_op_static_build PROPERTIES TIMEOUT 500)
13041304
set_tests_properties(test_sync_batch_norm_op_static_build
13051305
PROPERTIES LABELS "RUN_TYPE=DIST")
13061306
set_tests_properties(test_sync_batch_norm_op_static_build PROPERTIES TIMEOUT
1307-
120)
1307+
250)

0 commit comments

Comments
 (0)