deepmodeling
diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
Lines changed: 1 addition & 0 deletions
diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
Lines changed: 1 addition & 0 deletions
diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc
Lines changed: 6 additions & 18 deletions
@@ -13,6 +13,7 @@
 #define gpuMemcpy cudaMemcpy
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemset cudaMemset
 
 #define GPU_MAX_NBOR_SIZE 4096
 
@@ -16,6 +16,7 @@
 #define gpuMemcpy hipMemcpy
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemset hipMemset
 
 #define DPErrcheck(res) \
 
@@ -64,13 +64,9 @@ class GeluOp : public OpKernel {
     const int_64 size = static_cast<int_64>(output_tensor->NumElements());
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::gelu_gpu(out, x, size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::gelu_gpu(out, x, size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::gelu_cpu(out, x, size);
     }
@@ -108,13 +104,9 @@ class GeluGradOp : public OpKernel {
     const int_64 size = static_cast<int_64>(output_tensor->NumElements());
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::gelu_grad_gpu(out, x, dy, size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::gelu_grad_gpu(out, x, dy, size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::gelu_grad_cpu(out, x, dy, size);
     }
@@ -154,13 +146,9 @@ class GeluGradGradOp : public OpKernel {
     const int_64 size = static_cast<int_64>(output_tensor->NumElements());
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::gelu_grad_grad_gpu(out, x, dy, dy_2, size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::gelu_grad_grad_gpu(out, x, dy, dy_2, size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::gelu_grad_grad_cpu(out, x, dy, dy_2, size);
     }