Commit 60634f9

Merge pull request #215 from denghuilu/devel
add customized op for gelu activation function
2 parents 10e5780 + 7b0c12e commit 60634f9

10 files changed, +480 −25 lines changed
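The op added by this commit implements the widely used tanh approximation of GELU:

gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))

sqrt(2/pi) ≈ 0.7978845608028654 appears in the sources below as the constant SQRT_2_PI, and 3 * 0.044715 = 0.134145 shows up in the hand-written derivative expressions.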

source/CMakeLists.txt

Lines changed: 0 additions & 5 deletions
@@ -184,11 +184,6 @@ include_directories(${TensorFlow_INCLUDE_DIRS})
if (BUILD_CPP_IF)
  set (LIB_DEEPMD "deepmd")
  set (LIB_DEEPMD_OP "deepmd_op")
-  if (USE_CUDA_TOOLKIT)
-    set (LIB_DEEPMD_OP_CUDA "deepmd_op_cuda")
-  else()
-    set (LIB_DEEPMD_OP_CUDA "deepmd_op")
-  endif()
  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.9)
    set (LIB_DEEPMD_NATIVE "deepmd_native_md")
    set (LIB_DEEPMD_IPI "deepmd_ipi")

source/lmp/env.sh.in

Lines changed: 1 addition & 1 deletion
@@ -8,4 +8,4 @@ TF_RPATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -Wl,-rpath=/g"`

NNP_INC=" -std=c++11 @PREC_DEF@ @TTM_DEF@ @OLD_LMP_PPPM_DEF@ -I$TF_INCLUDE_DIRS -I$DEEPMD_ROOT/include/deepmd "
NNP_PATH=" -L$TF_LIBRARY_PATH -L$DEEPMD_ROOT/lib"
-NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_OP_CUDA@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib"
+NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib"

source/op/CMakeLists.txt

Lines changed: 17 additions & 4 deletions
@@ -3,8 +3,9 @@
set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_DIR}/lib/src/NeighborList.cpp)

set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} )
-file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc)
-file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc )
+file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc)
+file(GLOB OP_PY_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_gpu.cc)
+file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_gpu.cc)
file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc )
file(GLOB OP_PY *.py)

@@ -23,8 +24,20 @@ if (BUILD_CPP_IF)
endif (BUILD_CPP_IF)

if (BUILD_PY_IF)
-  add_library(op_abi SHARED ${OP_SRC} ${OP_LIB})
-  add_library(op_grads SHARED ${OP_GRADS_SRC})
+  if (USE_CUDA_TOOLKIT)
+    add_library(op_abi SHARED ${OP_PY_CUDA_SRC} ${OP_LIB})
+    add_library(op_grads SHARED ${OP_GRADS_SRC})
+    add_subdirectory(cuda)
+    find_package(CUDA REQUIRED)
+    include_directories(${CUDA_INCLUDE_DIRS})
+    set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda)
+    target_link_libraries (op_abi ${EXTRA_LIBS})
+    target_link_libraries (op_grads ${EXTRA_LIBS})
+    message(STATUS ${TensorFlowFramework_LIBRARY})
+  else (USE_CUDA_TOOLKIT)
+    add_library(op_abi SHARED ${OP_SRC} ${OP_LIB})
+    add_library(op_grads SHARED ${OP_GRADS_SRC})
+  endif(USE_CUDA_TOOLKIT)
  target_link_libraries(
    op_abi ${TensorFlowFramework_LIBRARY}
  )
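With USE_CUDA_TOOLKIT enabled, the Python extension op_abi is built from OP_PY_CUDA_SRC (the CPU sources plus gelu_gpu.cc) and linked against the static deepmd_op_cuda library from the cuda/ subdirectory. A minimal sketch of how such a compiled op library is consumed; the file name "libop_abi.so" and its path are assumptions for illustration, the real lookup is done by deepmd.env and is not part of this diff:

# Hedged sketch only: loading a compiled TensorFlow custom-op library.
import tensorflow as tf

op_module = tf.load_op_library("/path/to/deepmd/libop_abi.so")  # assumed name/path
# The ops registered in gelu.cc become snake_case Python bindings:
# op_module.gelu, op_module.gelu_grad, op_module.gelu_grad_grad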

source/op/_gelu.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+"""
+First-order derivatives and second-order derivatives for gelu function.
+"""
+
+from tensorflow.python.framework import ops
+from deepmd.env import op_module
+
+@ops.RegisterGradient("Gelu")
+def gelu_cc (op, dy) :
+    return op_module.gelu_grad(dy, op.inputs[0])
+
+@ops.RegisterGradient("GeluGrad")
+def gelu_grad_cc (op, dy) :
+    return [None, op_module.gelu_grad_grad(dy, op.inputs[0], op.inputs[1])]
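Once these gradients are registered, TensorFlow can differentiate through the custom op twice: the first derivative goes through GeluGrad, the second through GeluGradGrad. A sketch of how this might be exercised with TF1-style graph code; it assumes this _gelu module has already been imported for its side effects, which is how deepmd is expected to use it:

# Hedged sketch only: differentiating through the custom Gelu op.
import tensorflow as tf
from deepmd.env import op_module  # exposes gelu / gelu_grad / gelu_grad_grad

x = tf.placeholder(tf.float64, shape=[None])
y = op_module.gelu(x)                # forward: Gelu kernel
dy_dx = tf.gradients(y, x)[0]        # uses GeluGrad via @ops.RegisterGradient("Gelu")
d2y_dx2 = tf.gradients(dy_dx, x)[0]  # uses GeluGradGrad via @ops.RegisterGradient("GeluGrad")

with tf.Session() as sess:
    print(sess.run([y, dy_dx, d2y_dx2], feed_dict={x: [-1.0, 0.0, 1.0]}))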

source/op/cuda/CMakeLists.txt

Lines changed: 8 additions & 3 deletions
@@ -80,9 +80,14 @@ else ()
endif()

set (SOURCE_FILES
-    descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu
+    descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu gelu.cu
)

-cuda_add_library(deepmd_op_cuda SHARED ${SOURCE_FILES})
+cuda_add_library(deepmd_op_cuda STATIC ${SOURCE_FILES})

-install(TARGETS deepmd_op_cuda DESTINATION lib/)
+if (BUILD_CPP_IF)
+    install(TARGETS deepmd_op_cuda DESTINATION lib/)
+endif (BUILD_CPP_IF)
+if (BUILD_PY_IF)
+    install(TARGETS deepmd_op_cuda DESTINATION deepmd/)
+endif (BUILD_PY_IF)

source/op/cuda/gelu.cu

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#define SQRT_2_PI 0.7978845608028654
+
+template <typename T>
+__global__ void gelu(const T * in, T * out, int const size) {
+    int const idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size) {return;}
+
+    out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] * in[idx])));
+}
+
+template <typename T>
+__global__ void gelu_grad(const T * dy, const T * in, T * out, int const size) {
+    int const idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size) {return;}
+
+    // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] * in[idx])));
+    T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] * in[idx]));
+    out[idx] = dy[idx] * (0.5 * SQRT_2_PI * in[idx] * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1) + 0.5 * var1 + 0.5);
+}
+
+template <typename T>
+__global__ void gelu_grad_grad(const T * dy, const T * dy_, const T * in, T * out, int const size) {
+    int const idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size) {return;}
+
+    // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] * in[idx])));
+    T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] * in[idx]));
+    T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1);
+
+    out[idx] = dy[idx] * dy_[idx] * (0.134145 * SQRT_2_PI * in[idx] * in[idx] * (1 - var1 * var1) - SQRT_2_PI * in[idx] * var2 * (0.134145 * in[idx] * in[idx] + 1) * var1 + var2);
+}
+
+
+void GeluLauncher(const float * in, float * out, int const size) {
+    int const THREAD_ITEMS = 1024;
+    int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
+
+    gelu<<<BLOCK_NUMS, THREAD_ITEMS>>>(in, out, size);
+}
+
+void GeluLauncher(const double * in, double * out, int const size) {
+    int const THREAD_ITEMS = 1024;
+    int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
+
+    gelu<<<BLOCK_NUMS, THREAD_ITEMS>>>(in, out, size);
+}
+
+void GeluGradLauncher(const float * dy, const float * in, float * out, int const size) {
+    int const THREAD_ITEMS = 1024;
+    int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
+
+    gelu_grad<<<BLOCK_NUMS, THREAD_ITEMS>>>(dy, in, out, size);
+}
+
+void GeluGradLauncher(const double * dy, const double * in, double * out, int const size) {
+    int const THREAD_ITEMS = 1024;
+    int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
+
+    gelu_grad<<<BLOCK_NUMS, THREAD_ITEMS>>>(dy, in, out, size);
+}
+
+void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size) {
+    int const THREAD_ITEMS = 1024;
+    int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
+
+    gelu_grad_grad<<<BLOCK_NUMS, THREAD_ITEMS>>>(dy, dy_, in, out, size);
+}
+
+void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size) {
+    int const THREAD_ITEMS = 1024;
+    int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
+
+    gelu_grad_grad<<<BLOCK_NUMS, THREAD_ITEMS>>>(dy, dy_, in, out, size);
+}
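The closed-form expression in gelu_grad can be sanity-checked against a finite difference of the tanh-approximated GELU. A small NumPy sketch (not part of the commit) mirroring the kernel math with dy = 1:

# Hedged sketch only: verify the gelu_grad expression numerically.
import numpy as np

SQRT_2_PI = 0.7978845608028654  # sqrt(2/pi), same constant as in gelu.cu

def gelu(x):
    return x * 0.5 * (1.0 + np.tanh(SQRT_2_PI * (x + 0.044715 * x**3)))

def gelu_grad(x):
    # same expression as the gelu_grad kernel with dy = 1
    var1 = np.tanh(SQRT_2_PI * (x + 0.044715 * x**3))
    return 0.5 * SQRT_2_PI * x * (1 - var1**2) * (0.134145 * x**2 + 1) + 0.5 * var1 + 0.5

x = np.linspace(-4.0, 4.0, 101)
h = 1e-6
numeric = (gelu(x + h) - gelu(x - h)) / (2.0 * h)
print(np.max(np.abs(gelu_grad(x) - numeric)))  # expected to agree to roughly 1e-9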

source/op/gelu.cc

Lines changed: 176 additions & 0 deletions
@@ -0,0 +1,176 @@
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#define SQRT_2_PI 0.7978845608028654
+
+using namespace tensorflow;
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+REGISTER_OP("Gelu")
+    .Attr("T: {float, double}")
+    .Input("x: T")
+    .Output("output: T")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+        c->set_output(0, c->input(0));
+        return Status::OK();
+    });
+
+REGISTER_OP("GeluGrad")
+    .Attr("T: {float, double}")
+    .Input("dy: T")
+    .Input("x: T")
+    .Output("output: T")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+        c->set_output(0, c->input(1));
+        return Status::OK();
+    });
+
+REGISTER_OP("GeluGradGrad")
+    .Attr("T: {float, double}")
+    .Input("dy: T")
+    .Input("dy_: T")
+    .Input("x: T")
+    .Output("output: T")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+        c->set_output(0, c->input(2));
+        return Status::OK();
+    });
+
+template <typename Device, typename T>
+struct GeluFunctor {
+    void operator()(const Device& d, const T * in, T * out, int const size) {
+        #pragma omp parallel for
+        for (int ii = 0; ii < size; ii++) {
+            out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] * in[ii])));
+        }
+    }
+};
+
+template <typename Device, typename T>
+struct GeluGradFunctor {
+    void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) {
+        #pragma omp parallel for
+        for (int ii = 0; ii < size; ii++) {
+            T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] * in[ii]));
+            out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5);
+        }
+    }
+};
+
+template <typename Device, typename T>
+struct GeluGradGradFunctor {
+    void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) {
+        #pragma omp parallel for
+        for (int ii = 0; ii < size; ii++) {
+            T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] * in[ii]));
+            T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1);
+
+            out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2);
+        }
+    }
+};
+
+// OpKernel definition.
+// template parameter <T> is the datatype of the tensors.
+template <typename Device, typename T>
+class GeluOp : public OpKernel {
+  public :
+    explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+    void Compute(OpKernelContext* context) override {
+        // Grab the input tensor
+        const Tensor& x = context->input(0);
+
+        Tensor * output = NULL;
+        int context_output_index = 0;
+        OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
+                                                         x.shape(),
+                                                         &output));
+
+        GeluFunctor<Device, T>()(
+            context->eigen_device<Device>(),
+            x.flat<T>().data(),
+            output->flat<T>().data(),
+            static_cast<int>(output->NumElements())
+        );
+        // GeluLauncher(x.flat<T>().data(), output->flat<T>().data(), static_cast<int>(output->NumElements()));
+    }
+};
+
+// OpKernel definition.
+// template parameter <T> is the datatype of the tensors.
+template <typename Device, typename T>
+class GeluGradOp : public OpKernel {
+  public :
+    explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+    void Compute(OpKernelContext* context) override {
+        // Grab the input tensor
+        const Tensor& dy = context->input(0);
+        const Tensor& x = context->input(1);
+
+        Tensor * output = NULL;
+        int context_output_index = 0;
+        OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
+                                                         x.shape(),
+                                                         &output));
+
+        GeluGradFunctor<Device, T>()(
+            context->eigen_device<Device>(),
+            dy.flat<T>().data(),
+            x.flat<T>().data(),
+            output->flat<T>().data(),
+            static_cast<int>(output->NumElements())
+        );
+        // GeluGradLauncher(dy.flat<T>().data(), x.flat<T>().data(), output->flat<T>().data(), static_cast<int>(output->NumElements()));
+    }
+};
+
+// OpKernel definition.
+// template parameter <T> is the datatype of the tensors.
+template <typename Device, typename T>
+class GeluGradGradOp : public OpKernel {
+  public :
+    explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+    void Compute(OpKernelContext* context) override {
+        // Grab the input tensor
+        const Tensor& dy = context->input(0);
+        const Tensor& dy_ = context->input(1);
+        const Tensor& x = context->input(2);
+
+        Tensor * output = NULL;
+        int context_output_index = 0;
+        OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
+                                                         x.shape(),
+                                                         &output));
+
+        GeluGradGradFunctor<Device, T>()(
+            context->eigen_device<Device>(),
+            dy.flat<T>().data(),
+            dy_.flat<T>().data(),
+            x.flat<T>().data(),
+            output->flat<T>().data(),
+            static_cast<int>(output->NumElements())
+        );
+        // GeluGradGradLauncher(dy.flat<T>().data(), x.flat<T>().data(), output->flat<T>().data(), static_cast<int>(output->NumElements()));
+    }
+};
+
+#define REGISTER_CPU(T) \
+    /* Declare explicit instantiations in kernel_example.cu.cc. */ \
+    REGISTER_KERNEL_BUILDER( \
+        Name("Gelu").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+        GeluOp<CPUDevice, T>); \
+    /* Declare explicit instantiations in kernel_example.cu.cc. */ \
+    REGISTER_KERNEL_BUILDER( \
+        Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+        GeluGradOp<CPUDevice, T>); \
+    /* Declare explicit instantiations in kernel_example.cu.cc. */ \
+    REGISTER_KERNEL_BUILDER( \
+        Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+        GeluGradGradOp<CPUDevice, T>);
+REGISTER_CPU(float);
+REGISTER_CPU(double);
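The second-derivative expression shared by GeluGradGradFunctor and the gelu_grad_grad CUDA kernel can be checked the same way, by differencing the first derivative. A small NumPy sketch (not part of the commit), with dy = dy_ = 1:

# Hedged sketch only: verify the gelu_grad_grad expression numerically.
import numpy as np

SQRT_2_PI = 0.7978845608028654

def gelu_grad(x):
    var1 = np.tanh(SQRT_2_PI * (x + 0.044715 * x**3))
    return 0.5 * SQRT_2_PI * x * (1 - var1**2) * (0.134145 * x**2 + 1) + 0.5 * var1 + 0.5

def gelu_grad_grad(x):
    # same expression as GeluGradGradFunctor with dy = dy_ = 1
    var1 = np.tanh(SQRT_2_PI * (x + 0.044715 * x**3))
    var2 = SQRT_2_PI * (1 - var1**2) * (0.134145 * x**2 + 1)
    return (0.134145 * SQRT_2_PI * x**2 * (1 - var1**2)
            - SQRT_2_PI * x * var2 * (0.134145 * x**2 + 1) * var1
            + var2)

x = np.linspace(-4.0, 4.0, 101)
h = 1e-5
numeric = (gelu_grad(x + h) - gelu_grad(x - h)) / (2.0 * h)
print(np.max(np.abs(gelu_grad_grad(x) - numeric)))  # expected to agree to roughly 1e-9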
