@@ -229,8 +229,8 @@ __global__ void DCNIm2colKernel(
     }
 }

-template<typename _T>
-static __global__ void biasKernel(_T* data_input, const _T* bias, const int f_area, int edge) {
+template<typename DataType>
+static __global__ void biasKernel(DataType* data_input, const DataType* bias, const int f_area, int edge) {

     KernelPositionBlock;
     int bias_index = position / f_area;
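The rename from `_T` to `DataType` sidesteps an identifier C++ reserves for the implementation (any name beginning with an underscore followed by an uppercase letter; `_T` is also a macro in Windows' `<tchar.h>`). For context, `biasKernel` adds one bias value per output channel over that channel's `f_area = H * W` plane, with `edge = C * H * W` elements per batch item. Below is a minimal standalone sketch of the same pattern; the expansion of the repo's `KernelPositionBlock` macro and the rest of the kernel body are not shown in this diff, so both are assumptions here:

// Sketch only: assumed expansion of KernelPositionBlock and of the kernel body.
template<typename DataType>
static __global__ void biasKernelSketch(DataType* data_input, const DataType* bias, const int f_area, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x; // assumed KernelPositionBlock expansion
    if (position >= edge) return;                         // edge = C * H * W elements in one batch item

    int bias_index = position / f_area;                   // channel that owns this element
    data_input[position] += bias[bias_index];             // in-place per-channel bias add (assumed body)
}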
@@ -276,7 +276,7 @@ inline void segemm_native(cublasHandle_t handle,
     cublasCheck(cublasGemmEx(handle, transa, transb, m, n, k, &halpha, A, CUDA_R_16F, lda, B, CUDA_R_16F, ldb, &hbeta, C, CUDA_R_16F, ldc, CUDA_R_16F, CUBLAS_GEMM_DFALT));
 }

-template<typename _T>
+template<typename DataType>
 static void enqueue_native(cublasHandle_t handle, const std::vector<GTensor>& inputs, std::vector<GTensor>& outputs, const std::vector<GTensor>& weights, void* workspace, cudaStream_t stream) {
     auto& data = inputs[0];
     auto& om   = inputs[1];
@@ -295,16 +295,16 @@ static void enqueue_native(cublasHandle_t handle, const std::vector<GTensor>& in

     cublasCheck(cublasSetStream(handle, stream));
     for (int ibatch = 0; ibatch < data.batch(); ++ibatch) {
-        _T* maskWorkspacePtr   = (_T*)workspace + (maskSize + im2colSize) * ibatch;
-        _T* im2colWorkspacePtr = (_T*)workspace + (maskSize + im2colSize) * ibatch + maskSize;
+        DataType* maskWorkspacePtr   = (DataType*)workspace + (maskSize + im2colSize) * ibatch;
+        DataType* im2colWorkspacePtr = (DataType*)workspace + (maskSize + im2colSize) * ibatch + maskSize;

-        _T* inputMask = om.ptr<_T>(ibatch, om.channel() / 3 * 2);
+        DataType* inputMask = om.ptr<DataType>(ibatch, om.channel() / 3 * 2);
         checkCudaKernel(
             sigmoidKernel<<<CUDATools::grid_dims(maskSize), CUDATools::block_dims(maskSize), 0, stream>>>(inputMask, maskWorkspacePtr, maskSize);
         );

-        _T* datainput = data.ptr<_T>(ibatch);
-        _T* offset    = om.ptr<_T>(ibatch);
+        DataType* datainput = data.ptr<DataType>(ibatch);
+        DataType* offset    = om.ptr<DataType>(ibatch);

         auto jobs = (size_t)data.channel() * out.height() * out.width();
         checkCudaKernel(
@@ -314,17 +314,17 @@ static void enqueue_native(cublasHandle_t handle, const std::vector<GTensor>& in
             );
         );

-        _T* weightKernel = weights[0].ptr<_T>();
-        segemm_native(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, alpha, im2colWorkspacePtr, n, weightKernel, k, beta, out.ptr<_T>(ibatch), n);
+        DataType* weightKernel = weights[0].ptr<DataType>();
+        segemm_native(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, alpha, im2colWorkspacePtr, n, weightKernel, k, beta, out.ptr<DataType>(ibatch), n);

         if (weights.size() > 1) {
-            _T* weightBias = weights[1].ptr<_T>();
+            DataType* weightBias = weights[1].ptr<DataType>();
             size_t edge = out.count(1);
             size_t area = out.count(2);

             checkCudaKernel(
                 biasKernel<<<CUDATools::grid_dims(edge), CUDATools::block_dims(edge), 0, stream>>>(
-                    out.ptr<_T>(ibatch), weightBias, area, edge
+                    out.ptr<DataType>(ibatch), weightBias, area, edge
                 );
             );
         }
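One detail worth noting in the unchanged call: `segemm_native` receives `(n, m, k)` with the im2col buffer as the first operand because cuBLAS is column-major. A row-major `C[m x n] = W[m x k] * columns[k x n]` is obtained as the column-major product `columns^T * W^T`, which is exactly the swapped argument order above; the fp16 specialization routes the same call through `cublasGemmEx` with `CUDA_R_16F` operands. A self-contained float-only demo of the trick (shapes and buffer contents here are made up for illustration, and error checks are omitted):

// Demo: row-major C[m x n] = A[m x k] * B[k x n] via column-major cuBLAS.
// Passing (n, m, k) and swapping the operand order computes C^T = B^T * A^T,
// which lands in memory exactly as row-major C.
#include <cstdio>
#include <vector>
#include <cublas_v2.h>
#include <cuda_runtime.h>

int main() {
    const int m = 2, k = 3, n = 4;                 // all buffers row-major on the host
    std::vector<float> A = {1, 2, 3, 4, 5, 6};     // A[m x k]
    std::vector<float> B(k * n, 1.0f);             // B[k x n], all ones
    std::vector<float> C(m * n, 0.0f);             // C[m x n]

    float *dA, *dB, *dC;
    cudaMalloc(&dA, A.size() * sizeof(float));
    cudaMalloc(&dB, B.size() * sizeof(float));
    cudaMalloc(&dC, C.size() * sizeof(float));
    cudaMemcpy(dA, A.data(), A.size() * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, B.data(), B.size() * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.0f, beta = 0.0f;
    // Note the swapped order: B first, A second, dimensions (n, m, k), leading dims (n, k, n).
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, dB, n, dA, k, &beta, dC, n);

    cudaMemcpy(C.data(), dC, C.size() * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < m; ++i) {                  // since B is all ones, row i prints the sum of A's row i
        for (int j = 0; j < n; ++j) printf("%.0f ", C[i * n + j]);
        printf("\n");
    }
    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    return 0;
}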