@@ -356,32 +356,32 @@ float4 *input_frag_mem, float4* filter_frag_mem){
356356
357357
358358// Set of functions per row in Gw product
// Row 0 of the filter transform G*g (transform row [1, 0, 0]):
// passes element j of the 3x3 tile's first row through unchanged.
// G points at a 3x3 filter tile stored row-major (row stride 3); j is the column, 0..2.
__device__ float f_row1(float *G, int col) {
    return G[col];
}
// Row 1 of the filter transform G*g (transform row [0.5, 0.5, 0.5]):
// sums element j of all three rows of the 3x3 tile (row-major, row stride 3)
// and halves the result.
__device__ float f_row2(float *G, int j) {
    // 0.5f keeps the multiply in single precision; the previous bare 0.5
    // promoted it to a double operation on the device.
    return 0.5f * (G[j] + G[6 + j] + G[3 + j]);
}
// Row 2 of the filter transform G*g (transform row [0.5, -0.5, 0.5]):
// like f_row2 but the middle row (offset 3) enters with a negative sign.
__device__ float f_row3(float *G, int j) {
    // 0.5f keeps the multiply in single precision; the previous bare 0.5
    // promoted it to a double operation on the device.
    return 0.5f * (G[j] + G[6 + j] - G[3 + j]);
}
// Row 3 of the filter transform G*g (transform row [0, 0, 1]):
// passes element j of the 3x3 tile's last row (offset 6) through unchanged.
__device__ float f_row4(float *G, int col) {
    return G[6 + col];
}
371371 // Set of functions per column in GwGt product
// Column 0 of the (G*g)*G^T product (transform column [1, 0, 0]^T):
// passes the first element of the addressed row through unchanged.
// G points at the 4x3 intermediate (G*g); j is a row start offset (multiple of 3).
__device__ float f_col1(float *G, int rowOff) {
    return G[rowOff];
}
// Column 1 of the (G*g)*G^T product (transform column [0.5, 0.5, 0.5]^T):
// sums the three elements of the addressed row of the 4x3 intermediate and
// halves the result. j is a row start offset (multiple of 3).
__device__ float f_col2(float *G, int j) {
    // 0.5f keeps the multiply in single precision; the previous bare 0.5
    // promoted it to a double operation on the device.
    return 0.5f * (G[j] + G[j + 2] + G[j + 1]);
}
// Column 2 of the (G*g)*G^T product (transform column [0.5, -0.5, 0.5]^T):
// like f_col2 but the middle element (j+1) enters with a negative sign.
__device__ float f_col3(float *G, int j) {
    // 0.5f keeps the multiply in single precision; the previous bare 0.5
    // promoted it to a double operation on the device.
    return 0.5f * (G[j] + G[j + 2] - G[j + 1]);
}
// Column 3 of the (G*g)*G^T product (transform column [0, 0, 1]^T):
// passes the last element of the addressed row through unchanged.
__device__ float f_col4(float *G, int rowOff) {
    return G[rowOff + 2];
}
384-
384+
385385 typedef float (*pointFunction_t)(float *, int );
386386
387387 __global__ void FX (const float *pInputs, float *pOutputs, int filt_k,
@@ -403,9 +403,78 @@ __device__ float f_row1(float *Gw, int j){
403403 pointFunction_t func2[4 ] = {f_col1, f_col2, f_col3, f_col4};
404404
405405 for (int bk=0 ; bk<BK; bk+=blockDim .x ){
406+ // if(blockIdx.x == 0 && blockIdx.y == 0 && Inx == 0 && Iny == 0){
407+ // printf("[");
408+ // }
406409 for (int i=0 ; i<9 ; i++){
407410 Gw[i] = pInputs[c_kernel + i*filt_k];
411+ // if(blockIdx.x == 0 && blockIdx.y == 0 && Inx == 0 && Iny == 0){
412+ // printf("(%f,%d) ", Gw[i], c_kernel + i*filt_k);
413+ // }
414+ }
415+ // if(blockIdx.x == 0 && blockIdx.y == 0 && Inx == 0 && Iny == 0){
416+ // printf("]\n");
417+ // }
418+
419+ int aux;
420+ for (int i=0 ; i<4 ; i++){
421+ aux = i*3 ;
422+ for (int j=0 ; j<3 ; j++){
423+ Gw_buffer[j+aux] = (*func1[i])(Gw, j);
424+ }
425+ }
426+ // if(blockIdx.x == 0 && blockIdx.y == 0 && Inx == 0 && Iny == 0){
427+ // printf("X[");
428+ // for(int kk = 0; kk < 21; kk++){
429+ // printf("%f, ", Gw[kk]);
430+ // }
431+ // printf("]\n");
432+ // }
433+
434+ int aux2;
435+ for (int i=0 ; i<4 ; i++){
436+ aux = i*3 ; aux2 = i<<2 ;
437+ for (int j=0 ; j<4 ; j++){
438+ pOutputs[c_kernel_s+aux2*filt_k+j*filt_k] = (*func2[j])(Gw_buffer, aux);
439+ }
440+ }
441+
442+ c_kernel += blockDim .x ;
443+ c_kernel_s += blockDim .x ;
444+ }
445+ }
446+
447+ __global__ void FX_FP16 (const half *pInputs, float *pOutputs, int filt_k,
448+ int filt_c, int filt_h, int filt_w){
449+
450+ // assumes CHWK layout
451+ int Inx = threadIdx .x , Iny = threadIdx .y ;
452+ int TileX = blockIdx .x , TileY = blockIdx .y ;
453+
454+ int c_glb_offset = filt_k*filt_h*filt_w;
455+ int c_kernel = TileY*BC*c_glb_offset + TileX*BK + Iny*c_glb_offset + Inx;
456+ int c_glb_offset_s = filt_k*4 *4 ;
457+ int c_kernel_s = TileY*BC*c_glb_offset_s + TileX*BK + Iny*c_glb_offset_s + Inx;
458+
459+ float Gw[21 ]; // 9+12. In registers
460+ float *Gw_buffer = Gw+9 ;
461+
462+ pointFunction_t func1[4 ] = {f_row1, f_row2, f_row3, f_row4};
463+ pointFunction_t func2[4 ] = {f_col1, f_col2, f_col3, f_col4};
464+
465+ for (int bk=0 ; bk<BK; bk+=blockDim .x ){
466+ // if(blockIdx.x == 0 && blockIdx.y == 0 && Inx == 0 && Iny == 0){
467+ // printf("[");
468+ // }
469+ for (int i=0 ; i<9 ; i++){
470+ Gw[i] = __half2float (pInputs[c_kernel + i*filt_k]);
471+ // if(blockIdx.x == 0 && blockIdx.y == 0 && Inx == 0 && Iny == 0){
472+ // printf("(%f,%d) ", Gw[i], c_kernel + i*filt_k);
473+ // }
408474 }
475+ // if(blockIdx.x == 0 && blockIdx.y == 0 && Inx == 0 && Iny == 0){
476+ // printf("]\n");
477+ // }
409478
410479 int aux;
411480 for (int i=0 ; i<4 ; i++){
@@ -414,6 +483,13 @@ __device__ float f_row1(float *Gw, int j){
414483 Gw_buffer[j+aux] = (*func1[i])(Gw, j);
415484 }
416485 }
486+ // if(blockIdx.x == 0 && blockIdx.y == 0 && Inx == 0 && Iny == 0){
487+ // printf("X[");
488+ // for(int kk = 0; kk < 21; kk++){
489+ // printf("%f, ", Gw[kk]);
490+ // }
491+ // printf("]\n");
492+ // }
417493
418494 int aux2;
419495 for (int i=0 ; i<4 ; i++){
@@ -730,7 +806,21 @@ static void conv_winograd_stage0_f32_f32_cuda(
730806 int64_t filt_k = src0_ne0;
731807 int64_t filt_c = src0_ne3;
732808
733- FX<<<dim3 (filt_k/BK, filt_c/BC), dim3 (32 , BC)>>> (src0, dst, filt_k, filt_c, src0_ne2, src0_ne1);
809+ FX<<<dim3 (filt_k/BK, filt_c/BC), dim3 (32 , BC), 0 , stream>>> (src0, dst, filt_k, filt_c, src0_ne2, src0_ne1);
810+
811+ }
812+
// Host launcher for Winograd stage 0 on half-precision filters: runs the
// FX_FP16 filter transform (CHWK layout, per the kernel's own comment),
// producing the transformed filter as float into dst.
// NOTE(review): grid dims assume filt_k % BK == 0 and filt_c % BC == 0 — confirm callers.
static void conv_winograd_stage0_f16_f32_cuda(
        const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
        const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
        const half * src0, float * dst,
        cudaStream_t stream) {

    const int64_t filt_k = src0_ne0;  // K: number of filters
    const int64_t filt_c = src0_ne3;  // C: number of input channels

    const dim3 grid(filt_k / BK, filt_c / BC);
    const dim3 block(32, BC);
    // src0_ne2 / src0_ne1 are passed as filter height / width, mirroring the f32 variant.
    FX_FP16<<<grid, block, 0, stream>>>(src0, dst, filt_k, filt_c, src0_ne2, src0_ne1);
}
736826
@@ -756,38 +846,45 @@ static void conv_winograd_stage1_f32_f32_cuda(int tiles_dim_w, int tiles_dim_h,
756846 // printf("B %d, %d, %d \n", in_c, in_h, in_w);
757847 // printf("C %d, %d, %d \n", out_c, out_h, out_w);
758848
759- Winograd_kernel<<<dim3 ((tiles_dim_w+X-1 )/X, (tiles_dim_h+Y-1 )/Y, filt_k/BK), dim3 (BN, 8 ), smem_size>>> (src1, src0, dst,
849+ Winograd_kernel<<<dim3 ((tiles_dim_w+X-1 )/X, (tiles_dim_h+Y-1 )/Y, filt_k/BK), dim3 (BN, 8 ), smem_size, stream >>> (src1, src0, dst,
760850 tiles_dim_w, tiles_dim_h, in_c, in_h, in_w, tile_size, X, Y, filt_k, filt_c, out_c, tile_2d_s, out_h, out_w);
761851}
762852
763853
// Launch Winograd stage 0 (the filter transform) for dst's source tensor.
// Dispatches on src0's element type: F32 filters use the f32 kernel, F16
// filters use the f16 kernel. Any other type is rejected up front — the f16
// path reinterprets the raw data pointer as half, which would silently
// corrupt quantized filter data.
void ggml_cuda_op_winograd_stage0(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];  // the convolution filter tensor

    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    // Guard the type-punning dispatch below: only F32 and F16 are supported.
    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);

    if (src0->type == GGML_TYPE_F32) {
        const float * src0_d = (const float *)src0->data;
        conv_winograd_stage0_f32_f32_cuda(src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
            dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
            src0_d, dst_d, stream);
    } else {
        const half * src0_d = (const half *)src0->data;
        conv_winograd_stage0_f16_f32_cuda(src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
            dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
            src0_d, dst_d, stream);
    }
}
792889
793890
@@ -822,13 +919,14 @@ void ggml_cuda_op_winograd_stage1(ggml_backend_cuda_context & ctx, ggml_tensor *
822919 cudaMemcpyToSymbol (access_f_s, aux, 64 *sizeof (int ));
823920 cudaMemcpyToSymbol (access_s, aux2, 64 *sizeof (int ));
824921 cudaMemcpyToSymbol (tileid, tid, 64 *sizeof (int ));
825- // printf(" %d, %d, %d \n", tiles_dim_w, tiles_dim_h, tile_size);
922+
826923 conv_winograd_stage1_f32_f32_cuda (tiles_dim_w, tiles_dim_h, 4 , 8 ,
827924 tile_size, tile_2d_s,
828925 src0->ne [0 ], src0->ne [1 ], src0->ne [2 ], src0->ne [3 ],
829926 src1->ne [0 ], src1->ne [1 ], src1->ne [2 ], src1->ne [3 ],
830927 dst->ne [0 ], dst->ne [1 ], dst->ne [2 ], dst->ne [3 ],
831928 src0_d, src1_d, dst_d, stream);
929+
832930}
833931
834932
0 commit comments