@@ -11582,6 +11582,174 @@ static void ggml_compute_forward_conv_transpose_2d(
     }
 }
 
+// ggml_compute_forward_depthwise_conv_2d
+
+struct ggml_depthwise_conv_2d_params {
+    int64_t channels;
+    int64_t batch;
+    int64_t src_w;
+    int64_t src_h;
+    int64_t dst_w;
+    int64_t dst_h;
+    int64_t knl_w;
+    int64_t knl_h;
+    int     stride_x;
+    int     stride_y;
+    int     pad_x;
+    int     pad_y;
+};
+
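+// Two memory layouts are handled below:
+//   - CWHN: channels contiguous in memory (innermost dimension), vectorized over channels
+//   - WHCN: default ggml contiguous layout (width innermost), one plane per channel
+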
+static void ggml_compute_forward_depthwise_conv_2d_cwhn(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src,
+        const struct ggml_tensor * kernel,
+        struct ggml_tensor * dst,
+        const struct ggml_depthwise_conv_2d_params p) {
+
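+    // Split the output rows (dst_h * batch) evenly between threads.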
+    const int64_t c = p.channels;
+    const float * knl_data = (const float *)kernel->data;
+
+    const int64_t rows_total = p.dst_h * p.batch;
+    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
+    const int64_t row_start = params->ith * rows_per_thread;
+    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
+
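+    // Channels are processed in SIMD-register-sized packages; the remainder
+    // (or all channels, when GGML_SIMD is not available) is handled by the scalar loop.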
+#ifdef GGML_SIMD
+    const int64_t pkg_size = GGML_F32_EPR;
+    const int64_t pkg_count = c / pkg_size;
+    const int64_t c_pkg_end = pkg_count * pkg_size;
+#else
+    const int64_t c_pkg_end = 0;
+#endif
+
+    for (int64_t row = row_start; row < row_end; ++row) {
+        const int64_t dst_y = row % p.dst_h;
+        const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
+        for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+            float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
+            const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
+            const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
+
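+            // Kernel taps that fall outside the input are skipped, which
+            // amounts to implicit zero padding.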
+#ifdef GGML_SIMD
+            // Vectorized loop
+            for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
+                GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = src_y_base + knl_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = src_x_base + knl_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
+                        GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
+                        sum = GGML_F32_VEC_FMA(sum, k, s);
+                    }
+                }
+                GGML_F32_VEC_STORE(dst_data + c_i, sum);
+            }
+#endif
+            // Scalar loop
+            for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
+                float sum = 0.0f;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = src_y_base + knl_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = src_x_base + knl_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
+                             * src_data[(src_y * p.src_w + src_x) * c + c_i];
+                    }
+                }
+                dst_data[c_i] = sum;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_depthwise_conv_2d_whcn(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src,
+        const struct ggml_tensor * kernel,
+        struct ggml_tensor * dst,
+        const struct ggml_depthwise_conv_2d_params p) {
+
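+    // One (channel, batch) plane per work item, split evenly between threads.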
+    const int64_t n = p.channels * p.batch;
+    const int64_t per_thread = (n + params->nth - 1) / params->nth;
+    const int64_t start = params->ith * per_thread;
+    const int64_t end = MIN(start + per_thread, n);
+
+    for (int64_t i = start; i < end; ++i) {
+        const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
+        const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
+        float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
+
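+        // Direct convolution of this input plane with its single-channel kernel.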
+        for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
+            for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+
+                float sum = 0.0f;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = dst_y * p.stride_y + knl_y - p.pad_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = dst_x * p.stride_x + knl_x - p.pad_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        sum += knl_data[knl_y * p.knl_w + knl_x]
+                             * src_data[src_y * p.src_w + src_x];
+                    }
+                }
+                dst_data[dst_y * p.dst_w + dst_x] = sum;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_depthwise_conv_2d(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
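+    // src[0] is the kernel, src[1] is the input; strides and padding come from op_params.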
+    const struct ggml_tensor * kernel = dst->src[0];
+    const struct ggml_tensor * src = dst->src[1];
+    struct ggml_depthwise_conv_2d_params p;
+    p.channels = src->ne[2];
+    p.batch    = src->ne[3];
+    p.src_w    = src->ne[0];
+    p.src_h    = src->ne[1];
+    p.dst_w    = dst->ne[0];
+    p.dst_h    = dst->ne[1];
+    p.knl_w    = kernel->ne[0];
+    p.knl_h    = kernel->ne[1];
+    p.stride_x = dst->op_params[0];
+    p.stride_y = dst->op_params[1];
+    p.pad_x    = dst->op_params[2];
+    p.pad_y    = dst->op_params[3];
+
+    GGML_ASSERT(kernel->ne[3] == p.channels);
+    GGML_ASSERT(dst->ne[3] == p.batch);
+
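+    // Dispatch on the memory layout of src.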
+    if (ggml_is_contiguous(src)) {
+        ggml_compute_forward_depthwise_conv_2d_whcn(params, src, kernel, dst, p);
+    } else if (ggml_is_contiguous_channels(src)) {
+        // kernel should also have channels most contiguous in memory
+        GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
+        ggml_compute_forward_depthwise_conv_2d_cwhn(params, src, kernel, dst, p);
+    } else {
+        GGML_ABORT("non-contiguous memory layout not supported");
+    }
+}
+
 // ggml_compute_forward_pool_1d_sk_p0
 
 static void ggml_compute_forward_pool_1d_sk_p0(
@@ -14266,6 +14434,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_DEPTHWISE_CONV_2D:
+            {
+                ggml_compute_forward_depthwise_conv_2d(params, tensor);
+            } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
                 ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -14627,6 +14799,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_DEPTHWISE_CONV_2D:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
             {