@@ -169,9 +169,9 @@ __device__ void __inline__ outer_product(float4* input_frag, float4* filter_frag
 // extern "C"
 // {
 
-__device__ __forceinline__ void transform_output_tile(float *pOutputs, float2 *C_tile, float2 *At,
+__device__ __forceinline__ void transform_output_tile(float * __restrict__ pOutputs, float2 *C_tile, float2 *At,
                                                       int round, int c_tensor, int c_glb_offset, int i1, int i2,
-                                                      unsigned short mask1, unsigned short mask2, int out_w)
+                                                      unsigned int mask1, unsigned int mask2, int out_w)
 {
 
   c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset;
@@ -208,10 +208,10 @@ __device__ __forceinline__ void transform_output_tile(float *pOutputs, float2 *
   }
 }
 
-__device__ __forceinline__ unsigned short get_mask(int idd, int tiles_dim_w, int tiles_dim_h,
+__device__ __forceinline__ unsigned int get_mask(int idd, int tiles_dim_w, int tiles_dim_h,
                                                    int tw, int th, int out_w, int out_h){
 
-  unsigned short mask = 0x000F;
+  unsigned int mask = 0x000F;
   // if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; // pad bottom row
   // if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; // pad right col
   // if(blockIdx.y==gridDim.y-1 && (idd / tw) == th-1 && out_h%2) mask&=0x0003; // pad bottom row
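Note on the widened masks: get_mask builds a small bitmask with one bit per element of a 2x2 output tile (0x000F enables all four), and the commented-out lines above show how border tiles clear bits so partial tiles never store past out_w/out_h. A minimal sketch of how such a mask typically gates the stores, assuming a row-major 2x2 tile; the helper name is illustrative, not code from this commit:

// Illustrative only: bit i of `mask` says whether element i of a 2x2
// output tile is in-bounds; stores for cleared bits are skipped.
__device__ __forceinline__ void store_masked_2x2(float *out, const float vals[4],
                                                 unsigned int mask, int out_w) {
  #pragma unroll
  for (int i = 0; i < 4; i++) {
    if (mask & (1u << i))
      out[(i >> 1) * out_w + (i & 1)] = vals[i];
  }
}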
@@ -242,7 +242,7 @@ __device__ __forceinline__ unsigned short get_mask(int idd, int tiles_dim_w, int
   return mask;
 }
 
-__device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float *C,
+__device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float * __restrict__ C,
 int out_h, int out_w, int tiles_dim_w, int tiles_dim_h, int tw, int th,
 float4 *input_frag_mem, float4 *filter_frag_mem){
 
@@ -271,8 +271,8 @@ float4 *input_frag_mem, float4* filter_frag_mem){
   int id2 = (idd2 % tw) * 2 + (idd2 / tw) * out_w * 2;
 
   // unsigned short mask1 = 0x000F;
-  unsigned short mask1 = get_mask(idd1, tiles_dim_w, tiles_dim_h, tw, th, out_w, out_h);
-  unsigned short mask2 = get_mask(idd2, tiles_dim_w, tiles_dim_h, tw, th, out_w, out_h);
+  unsigned int mask1 = get_mask(idd1, tiles_dim_w, tiles_dim_h, tw, th, out_w, out_h);
+  unsigned int mask2 = get_mask(idd2, tiles_dim_w, tiles_dim_h, tw, th, out_w, out_h);
 
   // output transpose step
   int t=0;
@@ -355,29 +355,29 @@ float4 *input_frag_mem, float4* filter_frag_mem){
 
 
 // Set of functions per row in Gw product
-__device__ float f_row1(float *G, int j){
+__device__ float f_row1(float * __restrict__ G, int j){
   return G[j];
 }
-__device__ float f_row2(float *G, int j){
-  return 0.5*(G[j] + G[6+j] + G[3+j]);
+__device__ float f_row2(float * __restrict__ G, int j){
+  return 0.5f*(G[j] + G[6+j] + G[3+j]);
 }
-__device__ float f_row3(float *G, int j){
-  return 0.5*(G[j] + G[6+j] - G[3+j]);
+__device__ float f_row3(float * __restrict__ G, int j){
+  return 0.5f*(G[j] + G[6+j] - G[3+j]);
 }
-__device__ float f_row4(float *G, int j){
+__device__ float f_row4(float * __restrict__ G, int j){
   return G[6+j];
 }
 // Set of functions per column in GwGt product
-__device__ float f_col1(float *G, int j){
+__device__ float f_col1(float * __restrict__ G, int j){
   return G[j];
 }
-__device__ float f_col2(float *G, int j){
-  return 0.5*(G[j] + G[j+2] + G[j+1]);
+__device__ float f_col2(float * __restrict__ G, int j){
+  return 0.5f*(G[j] + G[j+2] + G[j+1]);
 }
-__device__ float f_col3(float *G, int j){
-  return 0.5*(G[j] + G[j+2] - G[j+1]);
+__device__ float f_col3(float * __restrict__ G, int j){
+  return 0.5f*(G[j] + G[j+2] - G[j+1]);
 }
-__device__ float f_col4(float *G, int j){
+__device__ float f_col4(float * __restrict__ G, int j){
   return G[j+2];
 }
 
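Two things are worth noting about this hunk. First, these helpers are the rows and columns of the F(2x2, 3x3) Winograd filter transform G*g*G^T, with

    [ 1     0     0  ]
G = [ 1/2   1/2   1/2]
    [ 1/2  -1/2   1/2]
    [ 0     0     1  ]

Second, the added f suffixes matter: an unsuffixed 0.5 is a double literal, so 0.5*(...) silently promotes the whole expression to double precision, and FP64 throughput on most GPUs is a small fraction of FP32. A hedged sketch of the same per-column transform written as one function, every literal kept single precision (the function name is illustrative, not from the commit):

// Illustrative: produce the 4 Winograd rows for column j of a 3x3 filter g
// (row-major, so g[3*r + j] is row r). Matches f_row1..f_row4 above.
__device__ void filter_transform_col(const float * __restrict__ g, int j, float out[4]) {
  out[0] = g[j];                                // row 1: g0
  out[1] = 0.5f * (g[j] + g[3 + j] + g[6 + j]); // row 2: (g0 + g1 + g2) / 2
  out[2] = 0.5f * (g[j] - g[3 + j] + g[6 + j]); // row 3: (g0 - g1 + g2) / 2
  out[3] = g[6 + j];                            // row 4: g2
}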
@@ -394,10 +394,10 @@ __device__ float f_row1(float *G, int j){
 typedef float (*pointFunction_t)(float *, int);
 
 template <typename T>
-__global__ void FX(const T *pInputs, float *pOutputs, int filt_k,
+__global__ void FX(const T * __restrict__ pInputs, float * __restrict__ pOutputs, int filt_k,
                    int filt_c, int filt_h, int filt_w){
 
-  // assumes KCHW layout
+   // assumes KCHW layout
   int Inx = threadIdx.x, Iny = threadIdx.y;
   int TileX = blockIdx.x, TileY = blockIdx.y;
 
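For reference on the "assumes KCHW layout" comment: in a K-major filter tensor the linear offset of element (k, c, y, x) is ((k*filt_c + c)*filt_h + y)*filt_w + x. This is generic background for reading the c_kernel arithmetic, not code from the commit:

// Generic KCHW offset: k outermost, w innermost.
__device__ __forceinline__ int kchw_offset(int k, int c, int y, int x,
                                           int C, int H, int W) {
  return ((k * C + c) * H + y) * W + x;
}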
@@ -418,7 +418,6 @@ __device__ float f_row1(float *G, int j){
   for(int bk=0; bk<BK; bk+=blockDim.x){
     for(int i=0; i<9; i++){
       Gw[i] = t2f32(pInputs[c_kernel + i]);
-
     }
 
     int aux;
@@ -428,7 +427,7 @@ __device__ float f_row1(float *G, int j){
         Gw_buffer[j+aux] = (*func1[i])(Gw, j);
       }
     }
-
+
     int aux2;
     for(int i=0; i<4; i++){
       aux = i*3; aux2 = i<<2;
@@ -444,7 +443,7 @@ __device__ float f_row1(float *G, int j){
 
 #define d(input, i, j) ( input[(i<<2) + (j)] )
 
-__device__ __forceinline__ void load_and_transform_input_tile(float *Btd, float *pOutputs){
+__device__ __forceinline__ void load_and_transform_input_tile(float *Btd, float * __restrict__ pOutputs){
 
   float workspace[3];
 
@@ -473,7 +472,7 @@ __device__ __forceinline__ void load_and_transform_input_tile(float *Btd, float
 
 }
 
-__device__ __forceinline__ void load_filter_tile(float *tiles, float *pOutputs,
+__device__ __forceinline__ void load_filter_tile(float *tiles, float * __restrict__ pOutputs,
                                                  int filt_c, int filt_k){
 
   int c_tensor_s = threadIdx.y*BK + threadIdx.x;
@@ -501,7 +500,7 @@ __device__ __forceinline__ void load_filter_tile(float *tiles, float *pOutputs,
 
 }
 
-__device__ __forceinline__ void prefetch_filter_tile(const float *pInputs, float *tiles, int filt_k){
+__device__ __forceinline__ void prefetch_filter_tile(const float * __restrict__ pInputs, float * __restrict__ tiles, int filt_k){
 
   int c_tensor = blockIdx.z*BK + (threadIdx.y*filt_k<<4) + threadIdx.x; // Iny*filt_k*4*4
   // each threadIdx.y corresponds to one channel; there are 8 different threadIdx.y so 8 channels
@@ -521,7 +520,7 @@ __device__ __forceinline__ void prefetch_filter_tile(const float *pInputs, float
   }
 }
 
-__device__ __forceinline__ void prefetch_input_tile(const float *pInputs, float *tile, int in_h,
+__device__ __forceinline__ void prefetch_input_tile(const float * __restrict__ pInputs, float *tile, int in_h,
                                                     int in_w, int tw, int th, unsigned short mask){
 
   // load one input tile
@@ -598,7 +597,7 @@ __global__ void Winograd_kernel(const float *A, const float *B, float *C,
   float *input_smem = (float *)shared_mem;
   float *filter_smem = (float *)&shared_mem[16*BC*BN];
 
-  unsigned short m = 0xFFFF;
+  unsigned int m = 0xFFFF;
 
   if(blockIdx.y==0 && (threadIdx.x / X) == 0) m &= 0xFFF0; // pad top row
   if(tiles_dim_w % X == 0 && tiles_dim_h % Y == 0){
@@ -734,7 +733,7 @@ cudaError_t convolutionForward_32Tx64x8(float *k, int in_h, int in_w, float *w,
 // }
 
 template <typename T>
- static void conv_winograd_stage0_f32_cuda(
+static void conv_winograd_stage0_f32_cuda(
     const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
     const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
     const T * src0, float * dst,
@@ -764,7 +763,7 @@ static void conv_winograd_stage1_f32_f32_cuda(int tiles_dim_w, int tiles_dim_h,
 
   Winograd_kernel<<<dim3((tiles_dim_w+X-1)/X, (tiles_dim_h+Y-1)/Y, filt_k/BK), dim3(BN, 8), smem_size, stream>>>(src1, src0, dst,
                     tiles_dim_w, tiles_dim_h, in_c, in_h, in_w, tile_size, X, Y,
-      filt_k, filt_c, out_c, tile_2d_s, out_h, out_w);
+                    filt_k, filt_c, out_c, tile_2d_s, out_h, out_w);
 }
 
 
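On the __restrict__ qualifiers added throughout: they promise the compiler that the annotated pointers do not alias, which lets nvcc reorder and batch memory operations more aggressively; for parameters that are both const and __restrict__, loads can additionally be served through the read-only data cache. A minimal sketch of the pattern (kernel name and shapes are illustrative):

// Illustrative: `in` is const + __restrict__, so its loads may go through the
// read-only cache; `out` being __restrict__ rules out aliasing with `in`.
__global__ void scale_copy(const float * __restrict__ in,
                           float * __restrict__ out, float s, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = s * in[i];
}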