Skip to content

Commit 0d5d3df

Browse files
committed
ggml-cpu : kernels for faster depthwise 2D convolution
1 parent ffed2f0 commit 0d5d3df

File tree

5 files changed

+394
-3
lines changed

5 files changed

+394
-3
lines changed

include/ggml.h

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,7 @@ extern "C" {
480480
GGML_OP_CONV_TRANSPOSE_1D,
481481
GGML_OP_IM2COL,
482482
GGML_OP_IM2COL_BACK,
483+
GGML_OP_DEPTHWISE_CONV_2D,
483484
GGML_OP_CONV_TRANSPOSE_2D,
484485
GGML_OP_POOL_1D,
485486
GGML_OP_POOL_2D,
@@ -680,6 +681,9 @@ extern "C" {
680681
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
681682
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
682683

684+
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
685+
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
686+
683687
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
684688
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
685689

@@ -1651,7 +1655,7 @@ extern "C" {
16511655
struct ggml_tensor * a,
16521656
struct ggml_tensor * b);
16531657

1654-
// depthwise
1658+
// depthwise (via im2col and mul_mat)
16551659
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
16561660
struct ggml_context * ctx,
16571661
struct ggml_tensor * a, // convolution kernel
@@ -1663,6 +1667,19 @@ extern "C" {
16631667
int d0, // dilation dimension 0
16641668
int d1); // dilation dimension 1
16651669

1670+
// Depthwise 2D convolution
1671+
// a: KW KH 1 C convolution kernel
1672+
// b: W H C N input data
1673+
// res: W_out H_out C N
1674+
GGML_API struct ggml_tensor * ggml_depthwise_conv_2d(
1675+
struct ggml_context * ctx,
1676+
struct ggml_tensor * a,
1677+
struct ggml_tensor * b,
1678+
int stride0,
1679+
int stride1,
1680+
int pad0,
1681+
int pad1);
1682+
16661683
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
16671684
struct ggml_context * ctx,
16681685
struct ggml_tensor * a,

src/ggml-cpu/ggml-cpu.c

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11582,6 +11582,174 @@ static void ggml_compute_forward_conv_transpose_2d(
1158211582
}
1158311583
}
1158411584

11585+
// ggml_compute_forward_depthwise_conv_2d
11586+
11587+
// Geometry and hyper-parameters of one depthwise 2D convolution, gathered
// from the kernel/source/destination tensors before dispatching to a
// layout-specific compute kernel.
struct ggml_depthwise_conv_2d_params {
    int64_t channels; // number of channels C (== number of depthwise filters)
    int64_t batch;    // batch size N
    int64_t src_w;    // input width
    int64_t src_h;    // input height
    int64_t dst_w;    // output width
    int64_t dst_h;    // output height
    int64_t knl_w;    // kernel width
    int64_t knl_h;    // kernel height
    int stride_x;     // stride along width
    int stride_y;     // stride along height
    int pad_x;        // zero-padding along width
    int pad_y;        // zero-padding along height
};
11601+
11602+
// Depthwise conv for the channels-most-contiguous (CWHN) layout: channels are
// innermost in memory, so the per-channel multiply-accumulate can be
// vectorized across channels. Assumes both src and kernel hold f32 data with
// channels contiguous (checked by the caller).
static void ggml_compute_forward_depthwise_conv_2d_cwhn(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src,
        const struct ggml_tensor * kernel,
        struct ggml_tensor * dst,
        const struct ggml_depthwise_conv_2d_params p) {

    const int64_t c = p.channels;
    const float * knl_data = (const float *)kernel->data;

    // Work is split across threads by output row over all batch elements.
    const int64_t rows_total = p.dst_h * p.batch;
    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
    const int64_t row_start = params->ith * rows_per_thread;
    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);

#ifdef GGML_SIMD
    // Channels [0, c_pkg_end) are handled by the vector loop in packages of
    // GGML_F32_EPR lanes; the remainder falls through to the scalar loop.
    const int64_t pkg_size = GGML_F32_EPR;
    const int64_t pkg_count = c / pkg_size;
    const int64_t c_pkg_end = pkg_count * pkg_size;
#else
    const int64_t c_pkg_end = 0;
#endif

    for (int64_t row = row_start; row < row_end; ++row) {
        const int64_t dst_y = row % p.dst_h;
        // row / p.dst_h is the batch index; advance src to that image.
        const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
        for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
            float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
            // Top-left source coordinate covered by the kernel window.
            const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
            const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;

#ifdef GGML_SIMD
            // Vectorized loop: one vector of channels at a time, accumulating
            // the whole kernel window before storing.
            for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
                GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
                    const int64_t src_y = src_y_base + knl_y;
                    if (src_y < 0 || src_y >= p.src_h) {
                        continue; // zero padding: out-of-bounds rows contribute nothing
                    }
                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
                        const int64_t src_x = src_x_base + knl_x;
                        if (src_x < 0 || src_x >= p.src_w) {
                            continue; // zero padding: out-of-bounds columns contribute nothing
                        }
                        GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
                        GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
                        sum = GGML_F32_VEC_FMA(sum, k, s);
                    }
                }
                GGML_F32_VEC_STORE(dst_data + c_i, sum);
            }
#endif
            // Scalar loop for the channel remainder (all channels without SIMD).
            for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
                float sum = 0.0f;
                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
                    const int64_t src_y = src_y_base + knl_y;
                    if (src_y < 0 || src_y >= p.src_h) {
                        continue;
                    }
                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
                        const int64_t src_x = src_x_base + knl_x;
                        if (src_x < 0 || src_x >= p.src_w) {
                            continue;
                        }
                        sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
                             * src_data[(src_y * p.src_w + src_x) * c + c_i];
                    }
                }
                dst_data[c_i] = sum;
            }
        }
    }
}
11677+
11678+
static void ggml_compute_forward_depthwise_conv_2d_whcn(
11679+
const struct ggml_compute_params * params,
11680+
const struct ggml_tensor * src,
11681+
const struct ggml_tensor * kernel,
11682+
struct ggml_tensor * dst,
11683+
const struct ggml_depthwise_conv_2d_params p) {
11684+
11685+
const int64_t n = p.channels * p.batch;
11686+
const int64_t per_thread = (n + params->nth - 1) / params->nth;
11687+
const int64_t start = params->ith * per_thread;
11688+
const int64_t end = MIN(start + per_thread, n);
11689+
11690+
for (int64_t i = start; i < end; ++i) {
11691+
const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
11692+
const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
11693+
float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
11694+
11695+
for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
11696+
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
11697+
11698+
float sum = 0.0f;
11699+
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
11700+
const int64_t src_y = dst_y * p.stride_y + knl_y - p.pad_y;
11701+
if (src_y < 0 || src_y >= p.src_h) {
11702+
continue;
11703+
}
11704+
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
11705+
const int64_t src_x = dst_x * p.stride_x + knl_x - p.pad_x;
11706+
if (src_x < 0 || src_x >= p.src_w) {
11707+
continue;
11708+
}
11709+
sum += knl_data[knl_y * p.knl_w + knl_x]
11710+
* src_data[src_y * p.src_w + src_x];
11711+
}
11712+
}
11713+
dst_data[dst_y * p.dst_w + dst_x] = sum;
11714+
}
11715+
}
11716+
}
11717+
}
11718+
11719+
static void ggml_compute_forward_depthwise_conv_2d(
11720+
const struct ggml_compute_params * params,
11721+
struct ggml_tensor * dst) {
11722+
11723+
const struct ggml_tensor * kernel = dst->src[0];
11724+
const struct ggml_tensor * src = dst->src[1];
11725+
struct ggml_depthwise_conv_2d_params p;
11726+
p.channels = src->ne[2];
11727+
p.batch = src->ne[3];
11728+
p.src_w = src->ne[0];
11729+
p.src_h = src->ne[1];
11730+
p.dst_w = dst->ne[0];
11731+
p.dst_h = dst->ne[1];
11732+
p.knl_w = kernel->ne[0];
11733+
p.knl_h = kernel->ne[1];
11734+
p.stride_x = dst->op_params[0];
11735+
p.stride_y = dst->op_params[1];
11736+
p.pad_x = dst->op_params[2];
11737+
p.pad_y = dst->op_params[3];
11738+
11739+
GGML_ASSERT(kernel->ne[3] == p.channels);
11740+
GGML_ASSERT(dst->ne[3] == p.batch);
11741+
11742+
if (ggml_is_contiguous(src)) {
11743+
ggml_compute_forward_depthwise_conv_2d_whcn(params, src, kernel, dst, p);
11744+
} else if (ggml_is_contiguous_channels(src)) {
11745+
// kernel should also have channels most contiguous in memory
11746+
GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
11747+
ggml_compute_forward_depthwise_conv_2d_cwhn(params, src, kernel, dst, p);
11748+
} else {
11749+
GGML_ABORT("non-contiguous memory layout not supported");
11750+
}
11751+
}
11752+
1158511753
// ggml_compute_forward_pool_1d_sk_p0
1158611754

1158711755
static void ggml_compute_forward_pool_1d_sk_p0(
@@ -14266,6 +14434,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
1426614434
{
1426714435
ggml_compute_forward_im2col_back_f32(params, tensor);
1426814436
} break;
14437+
case GGML_OP_DEPTHWISE_CONV_2D:
14438+
{
14439+
ggml_compute_forward_depthwise_conv_2d(params, tensor);
14440+
} break;
1426914441
case GGML_OP_CONV_TRANSPOSE_2D:
1427014442
{
1427114443
ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -14627,6 +14799,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
1462714799
} break;
1462814800
case GGML_OP_IM2COL:
1462914801
case GGML_OP_IM2COL_BACK:
14802+
case GGML_OP_DEPTHWISE_CONV_2D:
1463014803
case GGML_OP_CONV_TRANSPOSE_1D:
1463114804
case GGML_OP_CONV_TRANSPOSE_2D:
1463214805
{

src/ggml.c

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -955,6 +955,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
955955
"CONV_TRANSPOSE_1D",
956956
"IM2COL",
957957
"IM2COL_BACK",
958+
"DEPTHWISE_CONV_2D",
958959
"CONV_TRANSPOSE_2D",
959960
"POOL_1D",
960961
"POOL_2D",
@@ -996,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
996997
"OPT_STEP_ADAMW",
997998
};
998999

999-
static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
1000+
static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
10001001

10011002
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
10021003
"none",
@@ -1052,6 +1053,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
10521053
"conv_transpose_1d(x)",
10531054
"im2col(x)",
10541055
"im2col_back(x)",
1056+
"depthwise_conv_2d(x)",
10551057
"conv_transpose_2d(x)",
10561058
"pool_1d(x)",
10571059
"pool_2d(x)",
@@ -1093,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
10931095
"adamw(x)",
10941096
};
10951097

1096-
static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
1098+
static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
10971099

10981100
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
10991101

@@ -1344,6 +1346,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
13441346
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
13451347
}
13461348

1349+
bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
1350+
return
1351+
tensor->nb[0] > tensor->nb[2] &&
1352+
tensor->nb[1] > tensor->nb[0] &&
1353+
tensor->nb[2] == ggml_type_size(tensor->type);
1354+
}
1355+
13471356
static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
13481357
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
13491358

@@ -4019,6 +4028,44 @@ struct ggml_tensor * ggml_conv_2d_dw(
40194028
return result;
40204029
}
40214030

4031+
// ggml_depthwise_conv_2d
4032+
4033+
struct ggml_tensor * ggml_depthwise_conv_2d(
4034+
struct ggml_context * ctx,
4035+
struct ggml_tensor * a,
4036+
struct ggml_tensor * b,
4037+
int stride0,
4038+
int stride1,
4039+
int pad0,
4040+
int pad1) {
4041+
GGML_ASSERT(a->ne[2] == 1);
4042+
GGML_ASSERT(a->ne[3] == b->ne[2]);
4043+
int64_t ne[4];
4044+
ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, 1);
4045+
ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, 1);
4046+
ne[2] = b->ne[2];
4047+
ne[3] = b->ne[3];
4048+
4049+
struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
4050+
4051+
if (ggml_is_contiguous_channels(b)) {
4052+
// Result will be permuted the same way as input (CWHN order)
4053+
const int64_t type_size = ggml_type_size(result->type);
4054+
GGML_ASSERT(ggml_blck_size(result->type) == 1);
4055+
result->nb[0] = result->ne[2] * type_size;
4056+
result->nb[1] = result->ne[0] * result->nb[0];
4057+
result->nb[2] = type_size;
4058+
}
4059+
4060+
int32_t params[] = { stride0, stride1, pad0, pad1 };
4061+
ggml_set_op_params(result, params, sizeof(params));
4062+
4063+
result->op = GGML_OP_DEPTHWISE_CONV_2D;
4064+
result->src[0] = a;
4065+
result->src[1] = b;
4066+
return result;
4067+
}
4068+
40224069
// ggml_conv_transpose_2d_p0
40234070

40244071
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {

tests/CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,16 @@ add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
384384
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
385385

386386

387+
#
# test-depthwise-conv2d

set(TEST_TARGET test-depthwise-conv2d)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
# write per-test coverage data when built with LLVM profiling instrumentation
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
395+
396+
387397
#
388398
# test-mul-mat
389399

0 commit comments

Comments
 (0)