Skip to content

Commit 2ccc67d

Browse files
author
bssrdf
committed
added source code for winograd kernel
1 parent 68c251b commit 2ccc67d

File tree

2 files changed

+891
-0
lines changed

2 files changed

+891
-0
lines changed

src/ggml-cuda/conv-winograd.cu

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
#include "conv-transpose-1d.cuh"
2+
3+
static __global__ void conv_transpose_1d_kernel(
4+
const int s0, const int p0, const int d0, const int output_size,
5+
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
6+
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
7+
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
8+
const float * src0, const float * src1, float * dst) {
9+
int global_index = threadIdx.x + blockIdx.x * blockDim.x;
10+
if (global_index >= output_size) {
11+
return;
12+
}
13+
14+
int out_index = global_index / dst_ne0;
15+
16+
float accumulator = 0;
17+
18+
for (int c = 0; c < src0_ne2; c++) {
19+
int idx = global_index % dst_ne0;
20+
21+
int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
22+
int input_offset = src1_ne0 * c;
23+
24+
for (int i = 0; i < src1_ne0; i++) {
25+
if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
26+
continue;
27+
}
28+
int weight_idx = idx - i*s0;
29+
30+
float kernel_weight = src0[kernel_offset + weight_idx];
31+
float input_value = src1[input_offset+i];
32+
33+
accumulator += kernel_weight * input_value;
34+
}
35+
}
36+
dst[global_index] = accumulator;
37+
}
38+
39+
static void conv_transpose_1d_f32_f32_cuda(
40+
const int s0, const int p0, const int d0, const int output_size,
41+
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
42+
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
43+
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
44+
const float * src0, const float * src1, float * dst,
45+
cudaStream_t stream) {
46+
47+
const int num_blocks = (output_size + CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE;
48+
conv_transpose_1d_kernel<<<num_blocks,CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 0, stream>>>(
49+
s0,p0,d0,output_size,
50+
src0_ne0, src0_ne1, src0_ne2, src0_ne3,
51+
src1_ne0, src1_ne1, src1_ne2, src1_ne3,
52+
dst_ne0, dst_ne1, dst_ne2, dst_ne3,
53+
src0,src1, dst);
54+
}
55+
56+
57+
void ggml_cuda_op_winograd_stage0(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
58+
const ggml_tensor * src0 = dst->src[0];
59+
const float * src0_d = (const float *)src0->data;
60+
61+
const ggml_tensor * src1 = dst->src[1];
62+
const float * src1_d = (const float *)src1->data;
63+
64+
float * dst_d = (float *)dst->data;
65+
cudaStream_t stream = ctx.stream();
66+
67+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
68+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
69+
70+
GGML_ASSERT(ggml_is_contiguous(src0));
71+
GGML_ASSERT(ggml_is_contiguous(src1));
72+
73+
const int32_t * opts = (const int32_t *)dst->op_params;
74+
75+
const int s0 = opts[0];
76+
const int p0 = 0;//opts[3];
77+
const int d0 = 1;//opts[4];
78+
79+
const int64_t kernel_size = ggml_nelements(src0);
80+
const int64_t input_size = ggml_nelements(src1);
81+
const int64_t output_size = ggml_nelements(dst);
82+
83+
conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
84+
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
85+
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
86+
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
87+
src0_d, src1_d, dst_d, stream);
88+
}
89+
90+
91+
void ggml_cuda_op_winograd_stage1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
92+
const ggml_tensor * src0 = dst->src[0];
93+
const float * src0_d = (const float *)src0->data;
94+
95+
const ggml_tensor * src1 = dst->src[1];
96+
const float * src1_d = (const float *)src1->data;
97+
98+
float * dst_d = (float *)dst->data;
99+
cudaStream_t stream = ctx.stream();
100+
101+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
102+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
103+
104+
GGML_ASSERT(ggml_is_contiguous(src0));
105+
GGML_ASSERT(ggml_is_contiguous(src1));
106+
107+
const int32_t * opts = (const int32_t *)dst->op_params;
108+
109+
const int s0 = opts[0];
110+
const int p0 = 0;//opts[3];
111+
const int d0 = 1;//opts[4];
112+
113+
const int64_t kernel_size = ggml_nelements(src0);
114+
const int64_t input_size = ggml_nelements(src1);
115+
const int64_t output_size = ggml_nelements(dst);
116+
117+
conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
118+
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
119+
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
120+
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
121+
src0_d, src1_d, dst_d, stream);
122+
}
123+
124+

0 commit comments

Comments
 (0)