Skip to content

Commit 75dde41

Browse files
committed
WIP: minor tweak
1 parent 3ea524e commit 75dde41

File tree

2 files changed

+29
-28
lines changed

2 files changed

+29
-28
lines changed

ggml/src/ggml-cuda/cpy.cu

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -51,25 +51,23 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, co
5151

5252
const int64_t nmat = ne / (ne00 * ne01);
5353
const int64_t n = ne00 * ne01;
54-
// const int64_t n = ne01 * ne02;
5554
int width = ne01;
5655
int height = ne00;
5756
int x = blockIdx.x * TILE_DIM + threadIdx.x;
5857
int y = blockIdx.y * TILE_DIM + threadIdx.y;
5958
int tx = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
6059
int ty = blockIdx.x * TILE_DIM + threadIdx.y;
6160

62-
// __shared__ T tile[TILE_DIM * TILE_DIM];
6361
__shared__ T tile[TILE_DIM][TILE_DIM];
6462

6563
for(int i = 0; i < BLOCK_NM; ++i){
66-
__syncthreads();
6764

6865
const unsigned int imat = blockIdx.z * BLOCK_NM + i;
6966
if(imat >= nmat)
7067
break;
7168
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS){
72-
if(imat < nmat && x < width && y + j < height){
69+
// if(imat < nmat && x < width && y + j < height){
70+
if(x < width && y + j < height){
7371
const unsigned int idx = (y+j)*width + x;
7472
const int row = threadIdx.y+j;
7573
const int col = threadIdx.x ^ row;
@@ -90,10 +88,9 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, co
9088
// }
9189

9290
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS){
93-
94-
if(imat < nmat && ty + j < width && tx < height){
91+
// if(imat < nmat && ty + j < width && tx < height){
92+
if(ty + j < width && tx < height){
9593
const unsigned int idx = (ty+j)*height + tx;
96-
// const int row = threadIdx.x;
9794
const int col = (threadIdx.y+j) ^ threadIdx.x;
9895
// dst[imat*n + idx] = tile[threadIdx.x][threadIdx.y + j];
9996
dst[imat*n + idx] = tile[threadIdx.x][col];
@@ -104,25 +101,24 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, co
104101
// }
105102
}
106103
}
107-
// }
108104
}
109105

110-
if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
111-
// for(int j = 0; j < 32; ++j){
112-
// j = 0;
113-
for(int i = 0; i < 32; ++i)
114-
// printf("%.2f, ", src[j*48+i]);
115-
// printf("%.2f, ", src[j*48+i]);
116-
printf("%.2f, ", __half2float(src[i]));
117-
printf("]\n");
118-
// }
119-
printf("==============================\n");
120-
// for(int j = 0; j < 32; ++j){
121-
for(int i = 0; i < 32; ++i)
122-
printf("%.2f, ", __half2float(dst[i]));
123-
printf("]\n");
124-
// }
125-
}
106+
// if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
107+
// // for(int j = 0; j < 32; ++j){
108+
// // j = 0;
109+
// for(int i = 0; i < 32; ++i)
110+
// // printf("%.2f, ", src[j*48+i]);
111+
// // printf("%.2f, ", src[j*48+i]);
112+
// printf("%.2f, ", __half2float(src[i]));
113+
// printf("]\n");
114+
// // }
115+
// printf("==============================\n");
116+
// // for(int j = 0; j < 32; ++j){
117+
// for(int i = 0; i < 32; ++i)
118+
// printf("%.2f, ", __half2float(dst[i]));
119+
// printf("]\n");
120+
// // }
121+
// }
126122
}
127123

128124
static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
@@ -235,8 +231,8 @@ static void ggml_cpy_flt_cuda(
235231
if constexpr ((std::is_same_v<src_t, half> && std::is_same_v<dst_t, half> ||
236232
std::is_same_v<src_t, float> && std::is_same_v<dst_t, float>)
237233
&& transpose){
238-
printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);
239-
printf("cuda cpy transpose nb00=%d nb01=%d nb10=%d nb11=%d\n", nb00, nb01, nb10, nb11);
234+
// printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);
235+
// printf("cuda cpy transpose nb00=%d nb01=%d nb10=%d nb11=%d\n", nb00, nb01, nb10, nb11);
240236
// if (ne00 == ne11 && ne01 == ne10 && nb00 == nb11 && nb10 == nb01){ //transpose
241237
// if (transpose) { //transpose
242238
// printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);

tests/test-backend-ops.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6840,8 +6840,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
68406840
// test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
68416841
// test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3}));
68426842
// test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3}));
6843-
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {1, 0, 2, 3}, true));
6844-
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {1, 0, 2, 3}, false));
6843+
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}, true));
6844+
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}, false));
6845+
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}, true));
6846+
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}, false));
6847+
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}, true));
6848+
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}, false));
6849+
68456850

68466851
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
68476852
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));

0 commit comments

Comments
 (0)