Skip to content

Commit 8293aa1

Browse files
authored
Exclude TRT provider in tests that crashed on A100 (#19972)
The TensorRT EP hits a segmentation fault on A100 in some tests. Exclude the TRT EP from those tests on A100 to unblock development. ### Motivation and Context #19530
1 parent d4c8bc3 commit 8293aa1

File tree

9 files changed

+178
-101
lines changed

9 files changed

+178
-101
lines changed
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#ifdef USE_CUDA
#include "cuda_runtime_api.h"
#endif

namespace onnxruntime {
namespace test {

// Returns the CUDA architecture of the current device as
// 100 * major + 10 * minor (e.g. 800 for SM 8.0 / A100), or -1 when CUDA is
// unavailable or the device properties cannot be queried.
// The result is cached after the first call; we usually test on a single GPU
// (or multiple GPUs of the same architecture), so one probe suffices.
int GetCudaArchitecture() {
#ifdef USE_CUDA
  // Thread-safe, once-only initialization via a function-local static:
  // the lambda runs exactly once, which matches the stated intent of calling
  // cudaGetDeviceProperties only once (the original re-probed after failures
  // and mutated the static without synchronization).
  static const int cuda_arch = []() {
    int current_device_id = 0;
    cudaGetDevice(&current_device_id);
    // Must wait for the GPU to go idle; otherwise cudaGetDeviceProperties
    // might fail.
    cudaDeviceSynchronize();
    cudaDeviceProp prop;
    // When cudaGetDeviceProperties fails, just return -1 and raise no error.
    // If the CUDA device has an issue, the test will fail anyway, so there is
    // no need to raise an error here.
    if (cudaSuccess != cudaGetDeviceProperties(&prop, current_device_id)) {
      return -1;
    }
    return prop.major * 100 + prop.minor * 10;
  }();
  return cuda_arch;
#else
  // Without CUDA support there is no device to query.
  return -1;
#endif
}

}  // namespace test
}  // namespace onnxruntime

onnxruntime/test/common/cuda_op_test_utils.h

Lines changed: 5 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,37 +4,20 @@
44
#pragma once
55

66
#include "test/util/include/default_providers.h"
7-
#ifdef USE_CUDA
8-
#include "cuda_runtime_api.h"
9-
#endif
107

118
namespace onnxruntime {
129
namespace test {
1310

11+
// CUDA architecture of the current device like 100 * major + 10 * minor.
12+
// Please call this function after CUDA EP is enabled.
13+
int GetCudaArchitecture();
14+
1415
inline bool HasCudaEnvironment(int min_cuda_architecture) {
1516
if (DefaultCudaExecutionProvider().get() == nullptr) {
1617
return false;
1718
}
1819

19-
if (min_cuda_architecture == 0) {
20-
return true;
21-
}
22-
23-
int cuda_architecture = 0;
24-
25-
#ifdef USE_CUDA
26-
int currentCudaDevice = 0;
27-
cudaGetDevice(&currentCudaDevice);
28-
cudaDeviceSynchronize();
29-
cudaDeviceProp prop;
30-
if (cudaSuccess != cudaGetDeviceProperties(&prop, currentCudaDevice)) {
31-
return false;
32-
}
33-
34-
cuda_architecture = prop.major * 100 + prop.minor * 10;
35-
#endif
36-
37-
return cuda_architecture >= min_cuda_architecture;
20+
return GetCudaArchitecture() >= min_cuda_architecture;
3821
}
3922

4023
inline bool NeedSkipIfCudaArchLowerThan(int min_cuda_architecture) {
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <string>
#include <unordered_set>

#include "test/common/cuda_op_test_utils.h"

namespace onnxruntime {
namespace test {

// True when the TensorRT EP is enabled and the current device is an A100
// (compute capability 8.0, i.e. GetCudaArchitecture() == 800).
// TensorRT EP segfaults on A100: https://github.com/microsoft/onnxruntime/issues/19530
// Note: GetCudaArchitecture needs USE_CUDA to be defined. Currently it is
// defined whenever the TRT EP is enabled. If we want to make the TRT EP
// independent of the CUDA EP, the implementation of GetCudaArchitecture must
// change accordingly.
inline bool NeedExcludeTrtOnA100() {
  return DefaultTensorrtExecutionProvider() != nullptr && GetCudaArchitecture() == 800;
}

// Returns a provider exclusion set containing the TensorRT EP when running on
// A100, otherwise an empty set.
// (Return type is intentionally non-const so callers can move the result.)
inline std::unordered_set<std::string> ExcludeTrtOnA100() {
  if (NeedExcludeTrtOnA100()) {
    return {kTensorrtExecutionProvider};
  }
  return {};
}

// Adds the TensorRT EP to an existing exclusion list when running on A100 and
// returns a reference to the (possibly updated) list.
inline const std::unordered_set<std::string>& ExcludeTrtOnA100(std::unordered_set<std::string>& excluded_providers) {
  if (NeedExcludeTrtOnA100()) {
    excluded_providers.insert(kTensorrtExecutionProvider);
  }
  return excluded_providers;
}

}  // namespace test
}  // namespace onnxruntime

0 commit comments

Comments
 (0)