Skip to content

Commit e0e94c4

Browse files
author
bssrdf
committed
change mask to unsigned int; add __restrict__ to various pointers
1 parent 4e8e0d4 commit e0e94c4

File tree

1 file changed

+29
-30
lines changed

1 file changed

+29
-30
lines changed

src/ggml-cuda/conv-winograd.cu

Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -169,9 +169,9 @@ __device__ void __inline__ outer_product(float4* input_frag, float4* filter_frag
169169
// extern "C"
170170
// {
171171

172-
__device__ __forceinline__ void transform_output_tile(float *pOutputs, float2 *C_tile, float2 *At,
172+
__device__ __forceinline__ void transform_output_tile(float * __restrict__ pOutputs, float2 *C_tile, float2 *At,
173173
int round, int c_tensor, int c_glb_offset, int i1, int i2,
174-
unsigned short mask1, unsigned short mask2, int out_w)
174+
unsigned int mask1, unsigned int mask2, int out_w)
175175
{
176176

177177
c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset;
@@ -208,10 +208,10 @@ __device__ __forceinline__ void transform_output_tile(float *pOutputs, float2 *
208208
}
209209
}
210210

211-
__device__ __forceinline__ unsigned short get_mask(int idd, int tiles_dim_w, int tiles_dim_h,
211+
__device__ __forceinline__ unsigned int get_mask(int idd, int tiles_dim_w, int tiles_dim_h,
212212
int tw, int th, int out_w, int out_h){
213213

214-
unsigned short mask = 0x000F;
214+
unsigned int mask = 0x000F;
215215
// if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; // pad bottom row
216216
// if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; // pad right col
217217
// if(blockIdx.y==gridDim.y-1 && (idd / tw) == th-1 && out_h%2) mask&=0x0003; // pad bottom row
@@ -242,7 +242,7 @@ __device__ __forceinline__ unsigned short get_mask(int idd, int tiles_dim_w, int
242242
return mask;
243243
}
244244

245-
__device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float *C,
245+
__device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float * __restrict__ C,
246246
int out_h, int out_w, int tiles_dim_w, int tiles_dim_h, int tw, int th,
247247
float4 *input_frag_mem, float4* filter_frag_mem){
248248

@@ -271,8 +271,8 @@ float4 *input_frag_mem, float4* filter_frag_mem){
271271
int id2 = (idd2 % tw) * 2 + (idd2 / tw) * out_w * 2;
272272

273273
// unsigned short mask1 = 0x000F;
274-
unsigned short mask1 = get_mask(idd1, tiles_dim_w, tiles_dim_h, tw, th, out_w, out_h);
275-
unsigned short mask2 = get_mask(idd2, tiles_dim_w, tiles_dim_h, tw, th, out_w, out_h);
274+
unsigned int mask1 = get_mask(idd1, tiles_dim_w, tiles_dim_h, tw, th, out_w, out_h);
275+
unsigned int mask2 = get_mask(idd2, tiles_dim_w, tiles_dim_h, tw, th, out_w, out_h);
276276

277277
// output transpose step
278278
int t=0;
@@ -355,29 +355,29 @@ float4 *input_frag_mem, float4* filter_frag_mem){
355355

356356

357357
// Set of functions per row in Gw product
358-
__device__ float f_row1(float *G, int j){
358+
__device__ float f_row1(float * __restrict__ G, int j){
359359
return G[j];
360360
}
361-
__device__ float f_row2(float *G, int j){
362-
return 0.5*(G[j] + G[6+j] + G[3+j]);
361+
__device__ float f_row2(float * __restrict__ G, int j){
362+
return 0.5f*(G[j] + G[6+j] + G[3+j]);
363363
}
364-
__device__ float f_row3(float *G, int j){
365-
return 0.5*(G[j] + G[6+j] - G[3+j]);
364+
__device__ float f_row3(float * __restrict__ G, int j){
365+
return 0.5f*(G[j] + G[6+j] - G[3+j]);
366366
}
367-
__device__ float f_row4(float *G, int j){
367+
__device__ float f_row4(float * __restrict__ G, int j){
368368
return G[6+j];
369369
}
370370
// Set of functions per column in GwGt product
371-
__device__ float f_col1(float *G, int j){
371+
__device__ float f_col1(float * __restrict__ G, int j){
372372
return G[j];
373373
}
374-
__device__ float f_col2(float *G, int j){
375-
return 0.5*(G[j] + G[j+2] + G[j+1]);
374+
__device__ float f_col2(float * __restrict__ G, int j){
375+
return 0.5f*(G[j] + G[j+2] + G[j+1]);
376376
}
377-
__device__ float f_col3(float *G, int j){
378-
return 0.5*(G[j] + G[j+2] - G[j+1]);
377+
__device__ float f_col3(float * __restrict__ G, int j){
378+
return 0.5f*(G[j] + G[j+2] - G[j+1]);
379379
}
380-
__device__ float f_col4(float *G, int j){
380+
__device__ float f_col4(float * __restrict__ G, int j){
381381
return G[j+2];
382382
}
383383

@@ -394,10 +394,10 @@ __device__ float f_row1(float *G, int j){
394394
typedef float(*pointFunction_t)(float *, int);
395395

396396
template<typename T>
397-
__global__ void FX(const T *pInputs, float *pOutputs, int filt_k,
397+
__global__ void FX(const T * __restrict__ pInputs, float * __restrict__ pOutputs, int filt_k,
398398
int filt_c, int filt_h, int filt_w){
399399

400-
// assumes KCHW layout
400+
// assumes KCHW layout
401401
int Inx = threadIdx.x, Iny = threadIdx.y;
402402
int TileX = blockIdx.x, TileY = blockIdx.y;
403403

@@ -418,7 +418,6 @@ __device__ float f_row1(float *G, int j){
418418
for(int bk=0; bk<BK; bk+=blockDim.x){
419419
for(int i=0; i<9; i++){
420420
Gw[i] = t2f32(pInputs[c_kernel + i]);
421-
422421
}
423422

424423
int aux;
@@ -428,7 +427,7 @@ __device__ float f_row1(float *G, int j){
428427
Gw_buffer[j+aux] = (*func1[i])(Gw, j);
429428
}
430429
}
431-
430+
432431
int aux2;
433432
for(int i=0; i<4; i++){
434433
aux = i*3; aux2 = i<<2;
@@ -444,7 +443,7 @@ __device__ float f_row1(float *G, int j){
444443

445444
#define d(input, i, j) ( input[(i<<2) + (j)] )
446445

447-
__device__ __forceinline__ void load_and_transform_input_tile(float *Btd, float *pOutputs){
446+
__device__ __forceinline__ void load_and_transform_input_tile(float *Btd, float * __restrict__ pOutputs){
448447

449448
float workspace[3];
450449

@@ -473,7 +472,7 @@ __device__ __forceinline__ void load_and_transform_input_tile(float *Btd, float
473472

474473
}
475474

476-
__device__ __forceinline__ void load_filter_tile(float *tiles, float *pOutputs,
475+
__device__ __forceinline__ void load_filter_tile(float *tiles, float * __restrict__ pOutputs,
477476
int filt_c, int filt_k){
478477

479478
int c_tensor_s = threadIdx.y*BK + threadIdx.x;
@@ -501,7 +500,7 @@ __device__ __forceinline__ void load_filter_tile(float *tiles, float *pOutputs,
501500

502501
}
503502

504-
__device__ __forceinline__ void prefetch_filter_tile(const float *pInputs, float *tiles, int filt_k){
503+
__device__ __forceinline__ void prefetch_filter_tile(const float * __restrict__ pInputs, float * __restrict__ tiles, int filt_k){
505504

506505
int c_tensor = blockIdx.z*BK + (threadIdx.y*filt_k<<4) + threadIdx.x; // Iny*filt_k*4*4
507506
// each threadIdx.y corresponds to one channel; there are 8 different threadIdx.y so 8 channels
@@ -521,7 +520,7 @@ __device__ __forceinline__ void prefetch_filter_tile(const float *pInputs, float
521520
}
522521
}
523522

524-
__device__ __forceinline__ void prefetch_input_tile(const float *pInputs, float *tile, int in_h,
523+
__device__ __forceinline__ void prefetch_input_tile(const float * __restrict__ pInputs, float *tile, int in_h,
525524
int in_w, int tw, int th, unsigned short mask){
526525

527526
// load one input tile
@@ -598,7 +597,7 @@ __global__ void Winograd_kernel(const float *A, const float *B, float *C,
598597
float *input_smem = (float*)shared_mem;
599598
float *filter_smem = (float*)&shared_mem[16*BC*BN];
600599

601-
unsigned short m = 0xFFFF;
600+
unsigned int m = 0xFFFF;
602601

603602
if(blockIdx.y==0 && (threadIdx.x / X) == 0) m &= 0xFFF0; // pad top row
604603
if(tiles_dim_w % X == 0 && tiles_dim_h % Y == 0){
@@ -734,7 +733,7 @@ cudaError_t convolutionForward_32Tx64x8(float *k, int in_h, int in_w, float *w,
734733
// }
735734

736735
template<typename T>
737-
static void conv_winograd_stage0_f32_cuda(
736+
static void conv_winograd_stage0_f32_cuda(
738737
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
739738
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
740739
const T * src0, float * dst,
@@ -764,7 +763,7 @@ static void conv_winograd_stage1_f32_f32_cuda(int tiles_dim_w, int tiles_dim_h,
764763

765764
Winograd_kernel<<<dim3((tiles_dim_w+X-1)/X, (tiles_dim_h+Y-1)/Y, filt_k/BK), dim3(BN, 8), smem_size, stream>>>(src1, src0, dst,
766765
tiles_dim_w, tiles_dim_h, in_c, in_h, in_w, tile_size, X, Y,
767-
filt_k, filt_c, out_c, tile_2d_s, out_h, out_w);
766+
filt_k, filt_c, out_c, tile_2d_s, out_h, out_w);
768767
}
769768

770769

0 commit comments

Comments
 (0)