@@ -51,25 +51,23 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, co
5151
5252 const int64_t nmat = ne / (ne00 * ne01);
5353 const int64_t n = ne00 * ne01;
54- // const int64_t n = ne01 * ne02;
5554 int width = ne01;
5655 int height = ne00;
5756 int x = blockIdx .x * TILE_DIM + threadIdx .x ;
5857 int y = blockIdx .y * TILE_DIM + threadIdx .y ;
5958 int tx = blockIdx .y * TILE_DIM + threadIdx .x ; // transpose block offset
6059 int ty = blockIdx .x * TILE_DIM + threadIdx .y ;
6160
62- // __shared__ T tile[TILE_DIM * TILE_DIM];
6361 __shared__ T tile[TILE_DIM][TILE_DIM];
6462
6563 for (int i = 0 ; i < BLOCK_NM; ++i){
66- __syncthreads ();
6764
6865 const unsigned int imat = blockIdx .z * BLOCK_NM + i;
6966 if (imat >= nmat)
7067 break ;
7168 for (int j = 0 ; j < TILE_DIM; j += BLOCK_ROWS){
72- if (imat < nmat && x < width && y + j < height){
69+ // if(imat < nmat && x < width && y + j < height){
70+ if (x < width && y + j < height){
7371 const unsigned int idx = (y+j)*width + x;
7472 const int row = threadIdx .y +j;
7573 const int col = threadIdx .x ^ row;
@@ -90,10 +88,9 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, co
9088 // }
9189
9290 for (int j = 0 ; j < TILE_DIM; j += BLOCK_ROWS){
93-
94- if (imat < nmat && ty + j < width && tx < height){
91+ // if(imat < nmat && ty + j < width && tx < height){
92+ if (ty + j < width && tx < height){
9593 const unsigned int idx = (ty+j)*height + tx;
96- // const int row = threadIdx.x;
9794 const int col = (threadIdx .y +j) ^ threadIdx .x ;
9895 // dst[imat*n + idx] = tile[threadIdx.x][threadIdx.y + j];
9996 dst[imat*n + idx] = tile[threadIdx .x ][col];
@@ -104,25 +101,24 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, co
104101 // }
105102 }
106103 }
107- // }
108104 }
109105
110- if (threadIdx .x == 0 && threadIdx .y == 0 && blockIdx .x == 0 && blockIdx .y == 0 && blockIdx .z == 0 ){
111- // for(int j = 0; j < 32; ++j){
112- // j = 0;
113- for (int i = 0 ; i < 32 ; ++i)
114- // printf("%.2f, ", src[j*48+i]);
115- // printf("%.2f, ", src[j*48+i]);
116- printf (" %.2f, " , __half2float (src[i]));
117- printf (" ]\n " );
118- // }
119- printf (" ==============================\n " );
120- // for(int j = 0; j < 32; ++j){
121- for (int i = 0 ; i < 32 ; ++i)
122- printf (" %.2f, " , __half2float (dst[i]));
123- printf (" ]\n " );
124- // }
125- }
106+ // if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
107+ // // for(int j = 0; j < 32; ++j){
108+ // // j = 0;
109+ // for(int i = 0; i < 32; ++i)
110+ // // printf("%.2f, ", src[j*48+i]);
111+ // // printf("%.2f, ", src[j*48+i]);
112+ // printf("%.2f, ", __half2float(src[i]));
113+ // printf("]\n");
114+ // // }
115+ // printf("==============================\n");
116+ // // for(int j = 0; j < 32; ++j){
117+ // for(int i = 0; i < 32; ++i)
118+ // printf("%.2f, ", __half2float(dst[i]));
119+ // printf("]\n");
120+ // // }
121+ // }
126122}
127123
128124static __device__ void cpy_blck_q8_0_f32 (const char * cxi, char * cdsti) {
@@ -235,8 +231,8 @@ static void ggml_cpy_flt_cuda(
235231 if constexpr ((std::is_same_v<src_t , half> && std::is_same_v<dst_t , half> ||
236232 std::is_same_v<src_t , float > && std::is_same_v<dst_t , float >)
237233 && transpose){
238- printf (" cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n " , ne, ne00, ne01, ne10, ne11);
239- printf (" cuda cpy transpose nb00=%d nb01=%d nb10=%d nb11=%d\n " , nb00, nb01, nb10, nb11);
234+ // printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);
235+ // printf("cuda cpy transpose nb00=%d nb01=%d nb10=%d nb11=%d\n", nb00, nb01, nb10, nb11);
240236 // if (ne00 == ne11 && ne01 == ne10 && nb00 == nb11 && nb10 == nb01){ //transpose
241237 // if (transpose) { //transpose
242238 // printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);
0 commit comments