Merge pull request #8946 from kexinzhao/fix_cuda_arch_fp16

kexinzhao · web-flow · commit 607eec30a848 · 2018-03-12T00:27:43.000-07:00
Add GPU compute capability check for float16 math function test
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
@@ -45,6 +45,9 @@ void gemm<platform::CUDADeviceContext, float16>(
   const half* h_B = reinterpret_cast<const half*>(B);
   half* h_C = reinterpret_cast<half*>(C);
 
+  // TODO(kexinzhao): add processing code for compute capability < 53 case
+  PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
+                    "cublas Hgemm requires GPU compute capability >= 53");
   PADDLE_ENFORCE(platform::dynload::cublasHgemm(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
       h_A, lda, &h_beta, h_C, N));
@@ -106,6 +109,9 @@ void gemm<platform::CUDADeviceContext, float16>(
   const half* h_B = reinterpret_cast<const half*>(B);
   half* h_C = reinterpret_cast<half*>(C);
 
+  // TODO(kexinzhao): add processing code for compute capability < 53 case
+  PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
+                    "cublas Hgemm requires GPU compute capability >= 53");
   PADDLE_ENFORCE(platform::dynload::cublasHgemm(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
       h_A, lda, &h_beta, h_C, ldc));
@@ -251,6 +257,9 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
   const half* h_B = reinterpret_cast<const half*>(B);
   half* h_C = reinterpret_cast<half*>(C);
 
+  // TODO(kexinzhao): add processing code for compute capability < 53 case
+  PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
+                    "cublas Hgemm requires GPU compute capability >= 53");
   PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
       strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu
@@ -72,6 +72,11 @@ TEST(math_function, notrans_mul_trans_fp16) {
   CUDAPlace gpu_place(0);
   CUDADeviceContext context(gpu_place);
 
+  // fp16 GEMM in cublas requires GPU compute capability >= 53
+  if (context.GetComputeCapability() < 53) {
+    return;
+  }
+
   float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
 
@@ -149,6 +154,11 @@ TEST(math_function, trans_mul_notrans_fp16) {
   CUDAPlace gpu_place(0);
   CUDADeviceContext context(gpu_place);
 
+  // fp16 GEMM in cublas requires GPU compute capability >= 53
+  if (context.GetComputeCapability() < 53) {
+    return;
+  }
+
   float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
 
@@ -248,6 +258,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   CUDAPlace gpu_place(0);
   CUDADeviceContext context(gpu_place);
 
+  // fp16 GEMM in cublas requires GPU compute capability >= 53
+  if (context.GetComputeCapability() < 53) {
+    return;
+  }
+
   int m = 2;
   int n = 3;
   int k = 3;
@@ -355,6 +370,11 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   CUDAPlace gpu_place(0);
   CUDADeviceContext context(gpu_place);
 
+  // fp16 GEMM in cublas requires GPU compute capability >= 53
+  if (context.GetComputeCapability() < 53) {
+    return;
+  }
+
   int m = 2;
   int n = 3;
   int k = 3;
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
@@ -127,6 +127,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
 
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
   SetDeviceId(place_.device);
+  compute_capability = GetCUDAComputeCapability(place_.device);
   multi_process = GetCUDAMultiProcessors(place_.device);
   max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
@@ -162,6 +163,10 @@ void CUDADeviceContext::Wait() const {
   PADDLE_ENFORCE(cudaGetLastError());
 }
 
+int CUDADeviceContext::GetComputeCapability() const {
+  return compute_capability;
+}
+
 int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
   return multi_process * max_threads_per_mp;
 }
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return place in the device context. */
   Place GetPlace() const override;
 
+  /*! \brief  Return compute capability in the device context. */
+  int GetComputeCapability() const;
+
   /*! \brief  Return the max physical thread count in the device context */
   int GetMaxPhysicalThreadCount() const;
 
@@ -104,6 +107,7 @@ class CUDADeviceContext : public DeviceContext {
   cudnnHandle_t cudnn_handle_;
   cublasHandle_t cublas_handle_;
 
+  int compute_capability;
   int multi_process;
   int max_threads_per_mp;
 };
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
@@ -33,6 +33,15 @@ int GetCUDADeviceCount() {
   return count;
 }
 
+int GetCUDAComputeCapability(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  cudaDeviceProp device_prop;
+  PADDLE_ENFORCE(cudaGetDeviceProperties(&device_prop, id),
+                 "cudaGetDeviceProperties failed in "
+                 "paddle::platform::GetCUDAComputeCapability");
+  return device_prop.major * 10 + device_prop.minor;
+}
+
 int GetCUDAMultiProcessors(int id) {
   PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
   int count;
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
@@ -30,6 +30,9 @@ const std::string kEnvFractionGpuMemoryToUse =
 //! Get the total number of GPU devices in system.
 int GetCUDADeviceCount();
 
+//! Get the compute capability of the ith GPU (format: major * 10 + minor)
+int GetCUDAComputeCapability(int i);
+
 //! Get the MultiProcessors of the ith GPU.
 int GetCUDAMultiProcessors(int i);