@@ -6064,6 +6064,174 @@ void ggml_compute_forward_conv_transpose_2d(
    }
}

// ggml_compute_forward_depthwise_conv_2d

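// Geometry and hyperparameters shared by both layout-specific kernels below:
// tensor extents are in elements, stride/padding in pixels along x (width) and y (height).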
struct ggml_depthwise_conv_2d_params {
    int64_t channels;
    int64_t batch;
    int64_t src_w;
    int64_t src_h;
    int64_t dst_w;
    int64_t dst_h;
    int64_t knl_w;
    int64_t knl_h;
    int stride_x;
    int stride_y;
    int pad_x;
    int pad_y;
};

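// CWHN variant: channels are the innermost (contiguous) dimension, so the inner
// work is vectorized across channels. Threads split the output by row, with
// dst_h * batch rows in total.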
static void ggml_compute_forward_depthwise_conv_2d_cwhn(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src,
        const struct ggml_tensor * kernel,
        struct ggml_tensor * dst,
        const struct ggml_depthwise_conv_2d_params p) {

    const int64_t c = p.channels;
    const float * knl_data = (const float *)kernel->data;

    const int64_t rows_total = p.dst_h * p.batch;
    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
    const int64_t row_start = params->ith * rows_per_thread;
    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);

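    // with SIMD, channels are processed in packs of GGML_F32_EPR floats (one register);
    // the remainder in [c_pkg_end, c) falls through to the scalar loop below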
#ifdef GGML_SIMD
    const int64_t pkg_size = GGML_F32_EPR;
    const int64_t pkg_count = c / pkg_size;
    const int64_t c_pkg_end = pkg_count * pkg_size;
#else
    const int64_t c_pkg_end = 0;
#endif

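    // each row index encodes a (batch, dst_y) pair: batch = row / dst_h, dst_y = row % dst_h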
    for (int64_t row = row_start; row < row_end; ++row) {
        const int64_t dst_y = row % p.dst_h;
        const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
        for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
            float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
            const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
            const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;

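            // (src_x_base, src_y_base) is the top-left corner of the receptive field;
            // taps that fall outside the input are skipped, which amounts to zero padding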
#ifdef GGML_SIMD
            // Vectorized loop
            for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
                GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
                    const int64_t src_y = src_y_base + knl_y;
                    if (src_y < 0 || src_y >= p.src_h) {
                        continue;
                    }
                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
                        const int64_t src_x = src_x_base + knl_x;
                        if (src_x < 0 || src_x >= p.src_w) {
                            continue;
                        }
                        GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
                        GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
                        sum = GGML_F32_VEC_FMA(sum, k, s);
                    }
                }
                GGML_F32_VEC_STORE(dst_data + c_i, sum);
            }
#endif
            // Scalar loop
            for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
                float sum = 0.0f;
                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
                    const int64_t src_y = src_y_base + knl_y;
                    if (src_y < 0 || src_y >= p.src_h) {
                        continue;
                    }
                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
                        const int64_t src_x = src_x_base + knl_x;
                        if (src_x < 0 || src_x >= p.src_w) {
                            continue;
                        }
                        sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
                             * src_data[(src_y * p.src_w + src_x) * c + c_i];
                    }
                }
                dst_data[c_i] = sum;
            }
        }
    }
}

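// WHCN variant: the default contiguous ggml layout (width innermost). Each thread
// processes a chunk of the channels * batch single-channel planes independently.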
static void ggml_compute_forward_depthwise_conv_2d_whcn(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src,
        const struct ggml_tensor * kernel,
        struct ggml_tensor * dst,
        const struct ggml_depthwise_conv_2d_params p) {

    const int64_t n = p.channels * p.batch;
    const int64_t per_thread = (n + params->nth - 1) / params->nth;
    const int64_t start = params->ith * per_thread;
    const int64_t end = MIN(start + per_thread, n);

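    // i enumerates (batch, channel) planes; the kernel is indexed with i % channels,
    // so each channel applies its own knl_w x knl_h filter across the whole batch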
    for (int64_t i = start; i < end; ++i) {
        const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
        const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
        float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;

        for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
            for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {

                float sum = 0.0f;
                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
                    const int64_t src_y = dst_y * p.stride_y + knl_y - p.pad_y;
                    if (src_y < 0 || src_y >= p.src_h) {
                        continue;
                    }
                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
                        const int64_t src_x = dst_x * p.stride_x + knl_x - p.pad_x;
                        if (src_x < 0 || src_x >= p.src_w) {
                            continue;
                        }
                        sum += knl_data[knl_y * p.knl_w + knl_x]
                             * src_data[src_y * p.src_w + src_x];
                    }
                }
                dst_data[dst_y * p.dst_w + dst_x] = sum;
            }
        }
    }
}

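// Entry point: gathers geometry from the tensors, reads stride/padding from
// dst->op_params ([stride_x, stride_y, pad_x, pad_y]) and dispatches on the memory
// layout of src. dst is presumably sized upstream with the usual convolution relation
// dst_w = (src_w + 2*pad_x - knl_w) / stride_x + 1 (and likewise for dst_h).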
static void ggml_compute_forward_depthwise_conv_2d(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {

    const struct ggml_tensor * kernel = dst->src[0];
    const struct ggml_tensor * src = dst->src[1];
    struct ggml_depthwise_conv_2d_params p;
    p.channels = src->ne[2];
    p.batch = src->ne[3];
    p.src_w = src->ne[0];
    p.src_h = src->ne[1];
    p.dst_w = dst->ne[0];
    p.dst_h = dst->ne[1];
    p.knl_w = kernel->ne[0];
    p.knl_h = kernel->ne[1];
    p.stride_x = dst->op_params[0];
    p.stride_y = dst->op_params[1];
    p.pad_x = dst->op_params[2];
    p.pad_y = dst->op_params[3];

    GGML_ASSERT(kernel->ne[3] == p.channels);
    GGML_ASSERT(dst->ne[3] == p.batch);

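    // pick the kernel matching src's memory layout: contiguous WHCN (ggml default)
    // or channels-contiguous CWHN (channels-last)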
    if (ggml_is_contiguous(src)) {
        ggml_compute_forward_depthwise_conv_2d_whcn(params, src, kernel, dst, p);
    } else if (ggml_is_contiguous_channels(src)) {
        // kernel should also have channels most contiguous in memory
        GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
        ggml_compute_forward_depthwise_conv_2d_cwhn(params, src, kernel, dst, p);
    } else {
        GGML_ABORT("non-contiguous memory layout not supported");
    }
}

// ggml_compute_forward_pool_1d_sk_p0

static void ggml_compute_forward_pool_1d_sk_p0(