@@ -1130,14 +1130,26 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
11301130 );
11311131 }
11321132 }
1133+ // if(threadIdx.x == 0 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){
1134+ // printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(acc_register_[3][0][0]), __half2float(acc_register_[3][0][1]),
1135+ // __half2float(acc_register_[3][0][2]), __half2float(acc_register_[3][0][3]));
1136+ // printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(A_register_[3][mma_k][0]), __half2float(A_register_[3][mma_k][1]),
1137+ // __half2float(A_register_[3][mma_k][2]), __half2float(A_register_[3][mma_k][3]));
1138+ // printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(B_register_[mma_k][0][0]), __half2float(B_register_[mma_k][0][1]),
1139+ // __half2float(B_register_[mma_k][0][2]), __half2float(B_register_[mma_k][0][3]));
1140+ // }
1141+ // if(threadIdx.x < 4 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){
1142+ // printf("A %d, %d, %d: %f, %f \n", block_k, mma_k, threadIdx.x, __half2float(A_register_[3][mma_k][0]), __half2float(A_register_[3][mma_k][1]));
1143+ // printf("B %d, %d, %d: %f, %f \n", block_k, mma_k, threadIdx.x, __half2float(B_register_[mma_k][0][0]), __half2float(B_register_[mma_k][0][1]));
1144+ // }
11331145 }
1134- // if(threadIdx.x == 4 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){
1135- // printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(acc_register_[0 ][0][0]), __half2float(acc_register_[0 ][0][1]),
1136- // __half2float(acc_register_[0 ][0][2]), __half2float(acc_register_[0 ][0][3]));
1137- // printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(A_register_[0 ][0][0]), __half2float(A_register_[0 ][0][1]),
1138- // __half2float(A_register_[0 ][0][2]), __half2float(A_register_[0 ][0][3]));
1139- // printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(B_register_[0 ][0][0]), __half2float(B_register_[0 ][0][1]),
1140- // __half2float(B_register_[0 ][0][2]), __half2float(B_register_[0 ][0][3]));
1146+ // if(threadIdx.x == 0 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){
1147+ // printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(acc_register_[3 ][0][0]), __half2float(acc_register_[3 ][0][1]),
1148+ // __half2float(acc_register_[3 ][0][2]), __half2float(acc_register_[3 ][0][3]));
1149+ // printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(A_register_[3 ][0][0]), __half2float(A_register_[3 ][0][1]),
1150+ // __half2float(A_register_[3 ][0][2]), __half2float(A_register_[3 ][0][3]));
1151+ // printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(B_register_[3 ][0][0]), __half2float(B_register_[3 ][0][1]),
1152+ // __half2float(B_register_[3 ][0][2]), __half2float(B_register_[3 ][0][3]));
11411153 // }
11421154
11431155
0 commit comments