Skip to content

Commit 396f558

Browse files
committed
WIP: bug fix
1 parent 610e41a commit 396f558

File tree

2 files changed

+33
-19
lines changed

2 files changed

+33
-19
lines changed

ggml/src/ggml-cuda/conv2d-implicit.cu

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1076,7 +1076,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
10761076
// prefetch the first block tile of A,B into shared memory
10771077
// half* A_block_gmem = input + (block_m * BM * A_stride);
10781078
const half* A_block_gmem = input;
1079-
const half* B_block_gmem = kernel + (block_n * weightKOffset);
1079+
// const half* B_block_gmem = kernel + (block_n * weightKOffset);
1080+
const half* B_block_gmem = kernel + block_n * BN * weightKOffset;
10801081
tileMemcpySwizzleA<BM, NUM_THREADS>(A_block_gmem, A_block_smem, inChannelOffset, param);
10811082
tileMemcpySwizzleB<BN, NUM_THREADS>(B_block_gmem, B_block_smem, weightKOffset, param);
10821083

@@ -1097,7 +1098,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
10971098
{
10981099
// half* A_block_gmem = A + (block_m * BM * A_stride) + (block_k * BK);
10991100
const half* A_block_gmem = input;
1100-
const half* B_block_gmem = kernel + (block_n * weightKOffset);
1101+
// const half* B_block_gmem = kernel + (block_n * weightKOffset);
1102+
const half* B_block_gmem = kernel + (block_n * BN * weightKOffset);
11011103
tileMemcpyLoadA<BM, BK, NUM_THREADS, 4>(A_block_gmem, A_gmem_cache_reg, block_k * BK, inChannelOffset, param);
11021104
tileMemcpyLoadB<BN, BK, NUM_THREADS, 4>(B_block_gmem, B_gmem_cache_reg, block_k * BK, weightKOffset, param);
11031105
}
@@ -1119,6 +1121,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
11191121
{
11201122
asm volatile (
11211123
"mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
1124+
// "mma.sync.aligned.m16n8k8.row.row.f16.f16.f16.f16 "
11221125
"{%0, %1}, "
11231126
"{%2, %3}, "
11241127
"{%4}, "
@@ -1130,14 +1133,14 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
11301133
);
11311134
}
11321135
}
1133-
// if(threadIdx.x == 0 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){
1134-
// printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(acc_register_[3][0][0]), __half2float(acc_register_[3][0][1]),
1135-
// __half2float(acc_register_[3][0][2]), __half2float(acc_register_[3][0][3]));
1136-
// printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(A_register_[3][mma_k][0]), __half2float(A_register_[3][mma_k][1]),
1137-
// __half2float(A_register_[3][mma_k][2]), __half2float(A_register_[3][mma_k][3]));
1138-
// printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(B_register_[mma_k][0][0]), __half2float(B_register_[mma_k][0][1]),
1139-
// __half2float(B_register_[mma_k][0][2]), __half2float(B_register_[mma_k][0][3]));
1140-
// }
1136+
if(threadIdx.x == 28 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){
1137+
printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(acc_register_[0][0][0]), __half2float(acc_register_[0][0][1]),
1138+
__half2float(acc_register_[0][0][2]), __half2float(acc_register_[0][0][3]));
1139+
printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(A_register_[0][mma_k][0]), __half2float(A_register_[0][mma_k][1]),
1140+
__half2float(A_register_[0][mma_k][2]), __half2float(A_register_[0][mma_k][3]));
1141+
printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(B_register_[mma_k][0][0]), __half2float(B_register_[mma_k][0][1]),
1142+
__half2float(B_register_[mma_k][0][2]), __half2float(B_register_[mma_k][0][3]));
1143+
}
11411144
// if(threadIdx.x < 4 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){
11421145
// printf("A %d, %d, %d: %f, %f \n", block_k, mma_k, threadIdx.x, __half2float(A_register_[3][mma_k][0]), __half2float(A_register_[3][mma_k][1]));
11431146
// printf("B %d, %d, %d: %f, %f \n", block_k, mma_k, threadIdx.x, __half2float(B_register_[mma_k][0][0]), __half2float(B_register_[mma_k][0][1]));

tests/test-conv2d-implicit.cpp

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,18 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
4242
// create data
4343
int KW = 3, KH = 3, IC = ic, OC = oc;
4444
int IW = iw, IH = ih, N = 1;
45+
srand(time(NULL));
4546

4647
// printf(" input: IC = %d, OC = %d, IW = %d, IH = %d \n ", IC, OC, IW, IH);
4748

4849
// Initialize adata
4950
std::vector<float> adata(KW * KH * IC * OC);
5051
for (int i = 0; i < KW * KH * IC * OC; i++) {
51-
adata[i] = 0.2f;
52+
// adata[i] = 2.f;
53+
adata[i] = (float)(i%KW)-1.f;
54+
// adata[i] = (rand() % 255) / 255.0;
55+
// float r = -1.f + static_cast <float> (rand()) /( static_cast <float> (RAND_MAX/(1.f-(-1.f))));
56+
// adata[i] = r;
5257
}
5358

5459
// Convert adata to fp16 format
@@ -58,7 +63,11 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
5863
// Initialize bdata
5964
std::vector<float> bdata(IW * IH * IC * N);
6065
for (int i = 0; i < IW * IH * IC * N; i++) {
61-
bdata[i] = 1.5f;
66+
bdata[i] = (float)(i%IW)/10.f;
67+
// bdata[i] = 1.5f;
68+
// bdata[i] = (rand() % 255) / 255.0;
69+
// float r = -1.f + static_cast <float> (rand()) /( static_cast <float> (RAND_MAX/(1.f-(-1.f))));
70+
// bdata[i] = r;
6271
}
6372

6473
size_t buffer_size = 0;
@@ -344,7 +353,7 @@ int main(void)
344353
// std::make_tuple(640,640,52,76),
345354
// std::make_tuple(640,640,104,152),
346355
// std::make_tuple(960,320,104,152),
347-
std::make_tuple(128,128,26,38),
356+
std::make_tuple(640,128,26,38),
348357
// std::make_tuple(1280,640,52,76),
349358
// std::make_tuple(1920,1280,26,38),
350359
// std::make_tuple(2560,1280,26,38),
@@ -378,7 +387,7 @@ int main(void)
378387
int iterations = 0;
379388

380389
double run_time0;
381-
std::vector<float> conv2d_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);
390+
std::vector<float> im2col_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);
382391

383392
ggml_gallocr_free(allocr);
384393

@@ -399,7 +408,7 @@ int main(void)
399408

400409
double run_time1;
401410
// std::vector<float> wino_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1);
402-
conv2d_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1);
411+
std::vector<float> conv2d_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1);
403412

404413

405414
ggml_gallocr_free(allocr);
@@ -439,11 +448,13 @@ int main(void)
439448
// for(int i = 0; i < ggml_nelements(wino_res); i++) {
440449
for(int i = 0; i < 26*38; i++) {
441450
// for(int i = 0; i < conv2d_data.size(); i++) {
442-
float diff = fabs(conv2d_data[i] - wino_data[i]);
451+
// float diff = fabs(conv2d_data[i] - wino_data[i]);
452+
float diff = fabs(im2col_data[i] - wino_data[i]);
453+
float diff1 = fabs(im2col_data[i] - conv2d_data[i]);
443454
// if(diff > 1.e-4) {
444-
printf("(%f, %f, %f, %d) \n",
445-
conv2d_data[i],
446-
wino_data[i], diff, i);
455+
printf("(%f, %f, %f, %f, %f, %d) \n",
456+
im2col_data[i], conv2d_data[i],
457+
wino_data[i], diff, diff1, i);
447458
// break;
448459
// }
449460
}

0 commit comments

Comments (0)