[cherry-pick]Add tf32 support for A100 tensor core acceleration for cuBLAS (#28732) (#30612)

AshburnLee · web-flow · commit fd9d6fdac285 · 2021-01-20T22:52:23.000+08:00
* Add tf32 support for A100 tensor core acceleration for cuBLAS (#28732) * Fixed an error * Fixed an error
diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
@@ -84,8 +84,13 @@ class CublasHandleHolder {
     if (math_type == CUBLAS_TENSOR_OP_MATH) {
       PADDLE_RETRY_CUDA_SUCCESS(
           dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH));
+#if CUDA_VERSION >= 11000
+    } else if (math_type == CUBLAS_TF32_TENSOR_OP_MATH) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          dynload::cublasSetMathMode(handle_, CUBLAS_TF32_TENSOR_OP_MATH));
+#endif  // CUDA_VERSION >= 11000
     }
-#endif
+#endif  // CUDA_VERSION >= 9000
   }
 
   ~CublasHandleHolder() PADDLE_MAY_THROW {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
@@ -198,7 +198,11 @@ class CUDAContext {
   /*! \brief  Call cublas function safely. */
   template <typename Callback>
   inline void CublasCall(Callback&& callback) const {
-    cublas_handle_->Call(std::forward<Callback>(callback));
+    if (cublas_tf32_tensor_core_handle_) {
+      cublas_tf32_tensor_core_handle_->Call(std::forward<Callback>(callback));
+    } else {
+      cublas_handle_->Call(std::forward<Callback>(callback));
+    }
   }
 
   /*! \brief  Check whether tensor core is supported */
@@ -225,7 +229,11 @@ class CUDAContext {
 #if CUDA_VERSION >= 9000
       cublas_tensor_core_handle_.reset(
           new CublasHandleHolder(RawStream(), CUBLAS_TENSOR_OP_MATH));
-#endif
+#if CUDA_VERSION >= 11000
+      cublas_tf32_tensor_core_handle_.reset(
+          new CublasHandleHolder(RawStream(), CUBLAS_TF32_TENSOR_OP_MATH));
+#endif  // CUDA_VERSION >= 11000
+#endif  // CUDA_VERSION >= 9000
     }
   }
 
@@ -268,6 +276,7 @@ class CUDAContext {
   void DestoryCuBlasContext() {
     cublas_handle_.reset();
     cublas_tensor_core_handle_.reset();
+    cublas_tf32_tensor_core_handle_.reset();
   }
 
   void DestoryCuSolverContext() {
@@ -284,6 +293,7 @@ class CUDAContext {
   cudnnHandle_t cudnn_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
+  std::unique_ptr<CublasHandleHolder> cublas_tf32_tensor_core_handle_;
   cusolverDnHandle_t cusolver_dn_handle_;
   DISABLE_COPY_AND_ASSIGN(CUDAContext);
 };
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
@@ -58,6 +58,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
diff --git a/python/paddle/fluid/tests/unittests/test_tf32_cublas.py b/python/paddle/fluid/tests/unittests/test_tf32_cublas.py
@@ -0,0 +1,57 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import six
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+
+class TestTF32Switch(unittest.TestCase):
+    def test_on_off(self):
+        if core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self.assertTrue(core.get_cublas_switch())  # default
+            core.set_cublas_switch(False)
+            self.assertFalse(core.get_cublas_switch())  # turn off
+            core.set_cublas_switch(True)
+            self.assertTrue(core.get_cublas_switch())  # turn on
+
+            core.set_cublas_switch(True)  # restore the switch
+        else:
+            pass
+
+
+class TestTF32OnMatmul(unittest.TestCase):
+    def test_dygraph_without_out(self):
+        if core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            core.set_cublas_switch(False)  # turn off
+            with fluid.dygraph.guard(place):
+                input_array1 = np.random.rand(4, 12, 64, 88).astype("float32")
+                input_array2 = np.random.rand(4, 12, 88, 512).astype("float32")
+                data1 = paddle.to_tensor(input_array1)
+                data2 = paddle.to_tensor(input_array2)
+                out = paddle.matmul(data1, data2)
+                expected_result = np.matmul(input_array1, input_array2)
+            self.assertTrue(np.allclose(expected_result, out.numpy(), 1e-03))
+            core.set_cublas_switch(True)  # restore the switch
+        else:
+            pass
+
+
+if __name__ == '__main__':
+    unittest.main()