Skip to content

Commit 607eec3

Browse files
authored
Merge pull request #8946 from kexinzhao/fix_cuda_arch_fp16
Add GPU compute capability check for float16 math function test
2 parents b5ef315 + c88f58d commit 607eec3

File tree

6 files changed

+50
-0
lines changed

6 files changed

+50
-0
lines changed

paddle/fluid/operators/math/math_function.cu

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ void gemm<platform::CUDADeviceContext, float16>(
4545
const half* h_B = reinterpret_cast<const half*>(B);
4646
half* h_C = reinterpret_cast<half*>(C);
4747

48+
// TODO(kexinzhao): add processing code for compute capability < 53 case
49+
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
50+
"cublas Hgemm requires GPU compute capability >= 53");
4851
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
4952
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
5053
h_A, lda, &h_beta, h_C, N));
@@ -106,6 +109,9 @@ void gemm<platform::CUDADeviceContext, float16>(
106109
const half* h_B = reinterpret_cast<const half*>(B);
107110
half* h_C = reinterpret_cast<half*>(C);
108111

112+
// TODO(kexinzhao): add processing code for compute capability < 53 case
113+
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
114+
"cublas Hgemm requires GPU compute capability >= 53");
109115
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
110116
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
111117
h_A, lda, &h_beta, h_C, ldc));
@@ -251,6 +257,9 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
251257
const half* h_B = reinterpret_cast<const half*>(B);
252258
half* h_C = reinterpret_cast<half*>(C);
253259

260+
// TODO(kexinzhao): add processing code for compute capability < 53 case
261+
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
262+
"cublas Hgemm requires GPU compute capability >= 53");
254263
PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
255264
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
256265
strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));

paddle/fluid/operators/math/math_function_test.cu

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,11 @@ TEST(math_function, notrans_mul_trans_fp16) {
7272
CUDAPlace gpu_place(0);
7373
CUDADeviceContext context(gpu_place);
7474

75+
// fp16 GEMM in cublas requires GPU compute capability >= 53
76+
if (context.GetComputeCapability() < 53) {
77+
return;
78+
}
79+
7580
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
7681
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
7782

@@ -149,6 +154,11 @@ TEST(math_function, trans_mul_notrans_fp16) {
149154
CUDAPlace gpu_place(0);
150155
CUDADeviceContext context(gpu_place);
151156

157+
// fp16 GEMM in cublas requires GPU compute capability >= 53
158+
if (context.GetComputeCapability() < 53) {
159+
return;
160+
}
161+
152162
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
153163
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
154164

@@ -248,6 +258,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
248258
CUDAPlace gpu_place(0);
249259
CUDADeviceContext context(gpu_place);
250260

261+
// fp16 GEMM in cublas requires GPU compute capability >= 53
262+
if (context.GetComputeCapability() < 53) {
263+
return;
264+
}
265+
251266
int m = 2;
252267
int n = 3;
253268
int k = 3;
@@ -355,6 +370,11 @@ TEST(math_function, gemm_trans_cublas_fp16) {
355370
CUDAPlace gpu_place(0);
356371
CUDADeviceContext context(gpu_place);
357372

373+
// fp16 GEMM in cublas requires GPU compute capability >= 53
374+
if (context.GetComputeCapability() < 53) {
375+
return;
376+
}
377+
358378
int m = 2;
359379
int n = 3;
360380
int k = 3;

paddle/fluid/platform/device_context.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
127127

128128
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
129129
SetDeviceId(place_.device);
130+
compute_capability = GetCUDAComputeCapability(place_.device);
130131
multi_process = GetCUDAMultiProcessors(place_.device);
131132
max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
132133
PADDLE_ENFORCE(cudaStreamCreate(&stream_));
@@ -162,6 +163,10 @@ void CUDADeviceContext::Wait() const {
162163
PADDLE_ENFORCE(cudaGetLastError());
163164
}
164165

166+
int CUDADeviceContext::GetComputeCapability() const {
167+
return compute_capability;
168+
}
169+
165170
int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
166171
return multi_process * max_threads_per_mp;
167172
}

paddle/fluid/platform/device_context.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
7979
/*! \brief Return place in the device context. */
8080
Place GetPlace() const override;
8181

82+
/*! \brief Return the compute capability of the GPU bound to this device context, encoded as major * 10 + minor (e.g. 53 for 5.3). */
83+
int GetComputeCapability() const;
84+
8285
/*! \brief Return the max physical thread count in the device context */
8386
int GetMaxPhysicalThreadCount() const;
8487

@@ -104,6 +107,7 @@ class CUDADeviceContext : public DeviceContext {
104107
cudnnHandle_t cudnn_handle_;
105108
cublasHandle_t cublas_handle_;
106109

110+
int compute_capability;
107111
int multi_process;
108112
int max_threads_per_mp;
109113
};

paddle/fluid/platform/gpu_info.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,15 @@ int GetCUDADeviceCount() {
3333
return count;
3434
}
3535

36+
int GetCUDAComputeCapability(int id) {
37+
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
38+
cudaDeviceProp device_prop;
39+
PADDLE_ENFORCE(cudaGetDeviceProperties(&device_prop, id),
40+
"cudaGetDeviceProperties failed in "
41+
"paddle::platform::GetCUDAComputeCapability");
42+
return device_prop.major * 10 + device_prop.minor;
43+
}
44+
3645
int GetCUDAMultiProcessors(int id) {
3746
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
3847
int count;

paddle/fluid/platform/gpu_info.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ const std::string kEnvFractionGpuMemoryToUse =
3030
//! Get the total number of GPU devices in system.
3131
int GetCUDADeviceCount();
3232

33+
//! Get the compute capability of the ith GPU (format: major * 10 + minor)
34+
int GetCUDAComputeCapability(int i);
35+
3336
//! Get the MultiProcessors of the ith GPU.
3437
int GetCUDAMultiProcessors(int i);
3538

0 commit comments

Comments
 (0)