
Commit 352c9c0

ggml-cpu : kernels for faster depthwise 2D convolution
1 parent 63015d2 commit 352c9c0

7 files changed: 395 additions & 3 deletions

include/ggml.h
Lines changed: 18 additions & 1 deletion

@@ -481,6 +481,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_DEPTHWISE_CONV_2D,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
@@ -677,6 +678,9 @@ extern "C" {
     GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
     GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2

+    // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+    GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

@@ -1660,7 +1664,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);

-    // depthwise
+    // depthwise (via im2col and mul_mat)
     GGML_API struct ggml_tensor * ggml_conv_2d_dw(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,  // convolution kernel
@@ -1672,6 +1676,19 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1

+    // Depthwise 2D convolution
+    // a:   KW    KH    1    C    convolution kernel
+    // b:   W     H     C    N    input data
+    // res: W_out H_out C    N
+    GGML_API struct ggml_tensor * ggml_depthwise_conv_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   stride0,
+            int                   stride1,
+            int                   pad0,
+            int                   pad1);
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
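For orientation, a minimal usage sketch of the new API (not part of the commit; the context `ctx` and the concrete sizes are illustrative assumptions). The kernel holds one KWxKH layer per channel, the input is WHCN:

// Hypothetical usage sketch, assuming an initialized ggml_context * ctx.
struct ggml_tensor * knl = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 1, 16);   // KW KH 1 C
struct ggml_tensor * inp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 32, 16, 1); // W  H  C N
// stride 1 and padding 1 in both dimensions -> output stays 32 x 32 x 16 x 1
struct ggml_tensor * out = ggml_depthwise_conv_2d(ctx, knl, inp, 1, 1, 1, 1);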

src/ggml-cpu/ggml-cpu.c
Lines changed: 5 additions & 0 deletions

@@ -1932,6 +1932,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_DEPTHWISE_CONV_2D:
+            {
+                ggml_compute_forward_depthwise_conv_2d(params, tensor);
+            } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
                 ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -2268,6 +2272,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_DEPTHWISE_CONV_2D:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
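The second hunk registers the new op with the group whose task count equals n_threads; the kernels in ops.cpp then partition the work among those threads themselves. As a standalone restatement of that split (a hypothetical helper, not in the commit):

// Each of nth threads takes a contiguous chunk of `total` items via ceiling
// division; the last chunk is clamped to the end of the range.
static void split_work(int ith, int nth, int64_t total, int64_t * start, int64_t * end) {
    const int64_t per_thread = (total + nth - 1) / nth; // ceil(total / nth)
    *start = ith * per_thread;
    *end   = *start + per_thread < total ? *start + per_thread : total;
}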

src/ggml-cpu/ops.cpp
Lines changed: 168 additions & 0 deletions

@@ -6064,6 +6064,174 @@ void ggml_compute_forward_conv_transpose_2d(
     }
 }

+// ggml_compute_forward_depthwise_conv_2d
+
+struct ggml_depthwise_conv_2d_params {
+    int64_t channels;
+    int64_t batch;
+    int64_t src_w;
+    int64_t src_h;
+    int64_t dst_w;
+    int64_t dst_h;
+    int64_t knl_w;
+    int64_t knl_h;
+    int stride_x;
+    int stride_y;
+    int pad_x;
+    int pad_y;
+};
+
+static void ggml_compute_forward_depthwise_conv_2d_cwhn(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src,
+        const struct ggml_tensor * kernel,
+        struct ggml_tensor * dst,
+        const struct ggml_depthwise_conv_2d_params p) {
+
+    const int64_t c = p.channels;
+    const float * knl_data = (const float *)kernel->data;
+
+    const int64_t rows_total = p.dst_h * p.batch;
+    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
+    const int64_t row_start = params->ith * rows_per_thread;
+    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
+
+#ifdef GGML_SIMD
+    const int64_t pkg_size = GGML_F32_EPR;
+    const int64_t pkg_count = c / pkg_size;
+    const int64_t c_pkg_end = pkg_count * pkg_size;
+#else
+    const int64_t c_pkg_end = 0;
+#endif
+
+    for (int64_t row = row_start; row < row_end; ++row) {
+        const int64_t dst_y = row % p.dst_h;
+        const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
+        for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+            float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
+            const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
+            const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
+
+#ifdef GGML_SIMD
+            // Vectorized loop
+            for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
+                GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = src_y_base + knl_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = src_x_base + knl_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
+                        GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
+                        sum = GGML_F32_VEC_FMA(sum, k, s);
+                    }
+                }
+                GGML_F32_VEC_STORE(dst_data + c_i, sum);
+            }
+#endif
+            // Scalar loop
+            for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
+                float sum = 0.0f;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = src_y_base + knl_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = src_x_base + knl_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
+                             * src_data[(src_y * p.src_w + src_x) * c + c_i];
+                    }
+                }
+                dst_data[c_i] = sum;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_depthwise_conv_2d_whcn(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src,
+        const struct ggml_tensor * kernel,
+        struct ggml_tensor * dst,
+        const struct ggml_depthwise_conv_2d_params p) {
+
+    const int64_t n = p.channels * p.batch;
+    const int64_t per_thread = (n + params->nth - 1) / params->nth;
+    const int64_t start = params->ith * per_thread;
+    const int64_t end = MIN(start + per_thread, n);
+
+    for (int64_t i = start; i < end; ++i) {
+        const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
+        const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
+        float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
+
+        for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
+            for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+
+                float sum = 0.0f;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = dst_y * p.stride_y + knl_y - p.pad_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = dst_x * p.stride_x + knl_x - p.pad_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        sum += knl_data[knl_y * p.knl_w + knl_x]
+                             * src_data[src_y * p.src_w + src_x];
+                    }
+                }
+                dst_data[dst_y * p.dst_w + dst_x] = sum;
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_depthwise_conv_2d(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * kernel = dst->src[0];
+    const struct ggml_tensor * src = dst->src[1];
+    struct ggml_depthwise_conv_2d_params p;
+    p.channels = src->ne[2];
+    p.batch = src->ne[3];
+    p.src_w = src->ne[0];
+    p.src_h = src->ne[1];
+    p.dst_w = dst->ne[0];
+    p.dst_h = dst->ne[1];
+    p.knl_w = kernel->ne[0];
+    p.knl_h = kernel->ne[1];
+    p.stride_x = dst->op_params[0];
+    p.stride_y = dst->op_params[1];
+    p.pad_x = dst->op_params[2];
+    p.pad_y = dst->op_params[3];
+
+    GGML_ASSERT(kernel->ne[3] == p.channels);
+    GGML_ASSERT(dst->ne[3] == p.batch);
+
+    if (ggml_is_contiguous(src)) {
+        ggml_compute_forward_depthwise_conv_2d_whcn(params, src, kernel, dst, p);
+    } else if (ggml_is_contiguous_channels(src)) {
+        // kernel should also have channels most contiguous in memory
+        GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
+        ggml_compute_forward_depthwise_conv_2d_cwhn(params, src, kernel, dst, p);
+    } else {
+        GGML_ABORT("non-contiguous memory layout not supported");
+    }
+}
+
 // ggml_compute_forward_pool_1d_sk_p0

 static void ggml_compute_forward_pool_1d_sk_p0(
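The two kernels differ mainly in which flat offset an element (x, y, c, n) maps to. A sketch of the two index functions (helper names are assumptions, not from the commit): the WHCN kernel walks whole (channel, batch) planes per thread, while CWHN keeps all channels of one pixel adjacent, which is what lets the vectorized loop accumulate GGML_F32_EPR channels per SIMD register:

// WHCN: contiguous layout, one full W*H plane per (channel, batch) pair.
static inline int64_t idx_whcn(int64_t x, int64_t y, int64_t c, int64_t n,
                               int64_t W, int64_t H, int64_t C) {
    return ((n * C + c) * H + y) * W + x;
}

// CWHN: channels innermost, so c varies fastest in memory.
static inline int64_t idx_cwhn(int64_t x, int64_t y, int64_t c, int64_t n,
                               int64_t W, int64_t H, int64_t C) {
    return ((n * H + y) * W + x) * C + c;
}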

src/ggml-cpu/ops.h
Lines changed: 1 addition & 0 deletions

@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_depthwise_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);

src/ggml.c
Lines changed: 49 additions & 2 deletions

@@ -956,6 +956,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CONV_TRANSPOSE_1D",
     "IM2COL",
     "IM2COL_BACK",
+    "DEPTHWISE_CONV_2D",
     "CONV_TRANSPOSE_2D",
     "POOL_1D",
     "POOL_2D",
@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };

-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "conv_transpose_1d(x)",
     "im2col(x)",
     "im2col_back(x)",
+    "depthwise_conv_2d(x)",
     "conv_transpose_2d(x)",
     "pool_1d(x)",
     "pool_2d(x)",
@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };

-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -1344,6 +1346,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
 }

+bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
+    return
+        tensor->nb[0] > tensor->nb[2] &&
+        tensor->nb[1] > tensor->nb[0] &&
+        tensor->nb[2] == ggml_type_size(tensor->type);
+}
+
 static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

@@ -4050,6 +4059,44 @@ struct ggml_tensor * ggml_conv_2d_dw(
     return result;
 }

+// ggml_depthwise_conv_2d
+
+struct ggml_tensor * ggml_depthwise_conv_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int stride0,
+        int stride1,
+        int pad0,
+        int pad1) {
+    GGML_ASSERT(a->ne[2] == 1);
+    GGML_ASSERT(a->ne[3] == b->ne[2]);
+    int64_t ne[4];
+    ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, 1);
+    ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, 1);
+    ne[2] = b->ne[2];
+    ne[3] = b->ne[3];
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
+
+    if (ggml_is_contiguous_channels(b)) {
+        // Result will be permuted the same way as input (CWHN order)
+        const int64_t type_size = ggml_type_size(result->type);
+        GGML_ASSERT(ggml_blck_size(result->type) == 1);
+        result->nb[0] = result->ne[2] * type_size;
+        result->nb[1] = result->ne[0] * result->nb[0];
+        result->nb[2] = type_size;
+    }
+
+    int32_t params[] = { stride0, stride1, pad0, pad1 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op = GGML_OP_DEPTHWISE_CONV_2D;
+    result->src[0] = a;
+    result->src[1] = b;
+    return result;
+}
+
 // ggml_conv_transpose_2d_p0

 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
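As the header comment says, a channels-contiguous tensor typically arises from a permutation. A sketch of one way to produce a tensor for which ggml_is_contiguous_channels() returns true (sizes and the `ctx` context are illustrative assumptions):

// Allocate the data CWHN (channels innermost), then permute the view so
// ne[] reads W H C N while the strides still keep channels adjacent.
struct ggml_tensor * t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 32, 32, 1); // C W H N in memory
struct ggml_tensor * v = ggml_permute(ctx, t, 2, 0, 1, 3);                      // logical shape: W H C N
// now v->nb[2] == ggml_type_size(v->type), v->nb[0] > v->nb[2], v->nb[1] > v->nb[0]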

tests/CMakeLists.txt
Lines changed: 10 additions & 0 deletions

@@ -384,6 +384,16 @@ add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")


+#
+# test-depthwise-conv2d
+
+set(TEST_TARGET test-depthwise-conv2d)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
+
+
 #
 # test-mul-mat

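The test source tests/test-depthwise-conv2d.cpp itself is not shown on this page. Purely as a hedged sketch of what such a harness could look like (every detail below is an assumption, not the commit's actual test):

// Hypothetical minimal check: an all-ones 3x3 kernel over an all-ones input
// with no padding, so every output value is the window sum, 9.
#include "ggml.h"
#include "ggml-cpu.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params ip = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * knl = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 1, 4); // KW KH 1 C
    struct ggml_tensor * inp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 8, 8, 4, 1); // W  H  C N
    ggml_set_f32(knl, 1.0f);
    ggml_set_f32(inp, 1.0f);

    struct ggml_tensor * out = ggml_depthwise_conv_2d(ctx, knl, inp, 1, 1, 0, 0);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 2);

    const float v = ggml_get_f32_1d(out, 0);
    printf("out[0] = %f (expected 9)\n", v);
    ggml_free(ctx);
    return v == 9.0f ? 0 : 1;
}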