@@ -37,90 +37,6 @@ static __global__ void cpy_flt(const char * cx, char * cdst_direct, const int ne
     cpy_1(cx + x_offset, cdst + dst_offset);
 }

-
-template <typename T>
-static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, const int ne,
-        const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-        const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-        const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
-
-    char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
-    const T* src = reinterpret_cast<const T*>(cx);
-    T* dst = reinterpret_cast<T*>(cdst);
-
-    const int64_t nmat = ne / (ne00 * ne01);
-    const int64_t n = ne00 * ne01;
-    int width = ne01;
-    int height = ne00;
-    int x = blockIdx.x * TILE_DIM + threadIdx.x;
-    int y = blockIdx.y * TILE_DIM + threadIdx.y;
-    int tx = blockIdx.y * TILE_DIM + threadIdx.x;  // transpose block offset
-    int ty = blockIdx.x * TILE_DIM + threadIdx.y;
-
-    __shared__ T tile[TILE_DIM][TILE_DIM];
-
-    for (int i = 0; i < BLOCK_NM; ++i){
-
-        const unsigned int imat = blockIdx.z * BLOCK_NM + i;
-        if (imat >= nmat)
-            break;
-        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS){
-            // if(imat < nmat && x < width && y + j < height){
-            if (x < width && y + j < height){
-                const unsigned int idx = (y+j)*width + x;
-                const int row = threadIdx.y+j;
-                const int col = threadIdx.x ^ row;
-                // tile[threadIdx.y+j][threadIdx.x] = src[imat*n + idx];
-                tile[row][col] = src[imat*n + idx];
-            }
-        }
-        __syncthreads();
-
-
-        // if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
-        //     printf("BEGIN %d\n", i);
-        //     for(int jj = 0; jj < TILE_DIM; ++jj){
-        //         for(int ii = 0; ii < TILE_DIM; ++ii)
-        //             printf("%.f, ", tile[jj][ii]);
-        //         printf("]\n");
-        //     }
-        // }
-
-        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS){
-            // if(imat < nmat && ty + j < width && tx < height){
-            if (ty + j < width && tx < height){
-                const unsigned int idx = (ty+j)*height + tx;
-                const int col = (threadIdx.y+j) ^ threadIdx.x;
-                // dst[imat*n + idx] = tile[threadIdx.x][threadIdx.y + j];
-                dst[imat*n + idx] = tile[threadIdx.x][col];
-                // if(imat*n + idx == 4*ne00){
-                //     printf("DEBUG: (%u, %u, %u, %u, %u), j=%d, tx=%d, ty=%d, imat=%u idx=%u dst[%u]=%.2f, %f\n",
-                //         threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, blockIdx.z, j, tx, ty,
-                //         imat, idx, imat*n + idx, dst[imat*n + idx], tile[threadIdx.x][threadIdx.y + j]);
-                // }
-            }
-        }
-    }
-
-    // if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
-    //     // for(int j = 0; j < 32; ++j){
-    //     // j = 0;
-    //     for(int i = 0; i < 32; ++i)
-    //         // printf("%.2f, ", src[j*48+i]);
-    //         // printf("%.2f, ", src[j*48+i]);
-    //         printf("%.2f, ", __half2float(src[i]));
-    //     printf("]\n");
-    //     // }
-    //     printf("==============================\n");
-    //     // for(int j = 0; j < 32; ++j){
-    //     for(int i = 0; i < 32; ++i)
-    //         printf("%.2f, ", __half2float(dst[i]));
-    //     printf("]\n");
-    //     // }
-    // }
-}
-
 static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
     float * cdstf = (float *)(cdsti);

@@ -228,28 +144,9 @@ static void ggml_cpy_flt_cuda(
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

-    if constexpr ((std::is_same_v<src_t, half> && std::is_same_v<dst_t, half> ||
-                   std::is_same_v<src_t, float> && std::is_same_v<dst_t, float>)
-                  && transpose){
-        // printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);
-        // printf("cuda cpy transpose nb00=%d nb01=%d nb10=%d nb11=%d\n", nb00, nb01, nb10, nb11);
-        // if (ne00 == ne11 && ne01 == ne10 && nb00 == nb11 && nb10 == nb01){ //transpose
-        // if (transpose) { //transpose
-        //     printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);
-        dim3 dimGrid( (ne01 + TILE_DIM - 1) / TILE_DIM,
-                      (ne00 + TILE_DIM - 1) / TILE_DIM,
-                      (ne/(ne00*ne01) + BLOCK_NM - 1) / BLOCK_NM );
-        dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
-        cpy_flt_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
-    } else { // other
-        const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-        cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-            (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
-    }
-    // } else{
-    //     cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-    //         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
-    // }
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_f32_q8_0_cuda(
@@ -435,11 +332,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
             CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
         }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        if (src0->op_params[10] == 999){
-            ggml_cpy_flt_cuda<float, float, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-        } else {
-            ggml_cpy_flt_cuda<float, float, false> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-        }
+        ggml_cpy_flt_cuda<float, float, false> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
         ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
@@ -470,11 +363,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        if (src0->op_params[10] == 999){
-            ggml_cpy_flt_cuda<half, half, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-        } else {
-            ggml_cpy_flt_cuda<half, half, false> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-        }
+        ggml_cpy_flt_cuda<half, half, false> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
         ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
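
For reference, the deleted cpy_flt_transpose kernel follows the classic tiled shared-memory transpose pattern: each block stages a TILE_DIM x TILE_DIM tile and XOR-swizzles the shared-memory column index (threadIdx.x ^ row) so that both the coalesced load and the transposed store touch 32 distinct banks without a padding column. Below is a minimal standalone sketch of that pattern, not the ggml code; the kernel name, float-only element type, test shape, and main driver are illustrative, and TILE_DIM/BLOCK_ROWS are assumed to be 32 and 8 since the diff does not show their definitions.

```cuda
// Minimal sketch of a tiled transpose with an XOR-swizzled shared-memory tile
// (the same layout trick used by the removed cpy_flt_transpose kernel).
// TILE_DIM and BLOCK_ROWS are assumed values; names and shapes are illustrative.
#include <cuda_runtime.h>
#include <cstdio>

constexpr int TILE_DIM   = 32;
constexpr int BLOCK_ROWS = 8;

__global__ void transpose_swizzled(const float * __restrict__ src,
                                   float * __restrict__ dst,
                                   int width, int height) {
    __shared__ float tile[TILE_DIM][TILE_DIM];

    const int x  = blockIdx.x * TILE_DIM + threadIdx.x;   // src column
    const int y  = blockIdx.y * TILE_DIM + threadIdx.y;   // src row
    const int tx = blockIdx.y * TILE_DIM + threadIdx.x;   // dst column
    const int ty = blockIdx.x * TILE_DIM + threadIdx.y;   // dst row

    // Coalesced load; the column index is XOR-ed with the row index, so a warp
    // writing one tile row and later reading one tile column hits 32 distinct
    // shared-memory banks (no +1 padding column needed).
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (x < width && y + j < height) {
            const int row = threadIdx.y + j;
            tile[row][threadIdx.x ^ row] = src[(y + j) * width + x];
        }
    }
    __syncthreads();

    // Coalesced store of the transposed tile, undoing the swizzle on the read.
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (ty + j < width && tx < height) {
            dst[(ty + j) * height + tx] = tile[threadIdx.x][(threadIdx.y + j) ^ threadIdx.x];
        }
    }
}

int main() {
    const int width = 48, height = 64;   // arbitrary non-square test shape
    const size_t bytes = size_t(width) * height * sizeof(float);

    float *src = nullptr, *dst = nullptr;
    cudaMallocManaged(&src, bytes);
    cudaMallocManaged(&dst, bytes);
    for (int i = 0; i < width * height; ++i) src[i] = float(i);

    dim3 grid((width  + TILE_DIM - 1) / TILE_DIM,
              (height + TILE_DIM - 1) / TILE_DIM);
    dim3 block(TILE_DIM, BLOCK_ROWS, 1);
    transpose_swizzled<<<grid, block>>>(src, dst, width, height);
    cudaDeviceSynchronize();

    // dst is width x height; dst[0*height + 1] should equal src[1*width + 0].
    printf("dst[1] = %.1f (expected %.1f)\n", dst[1], src[width]);

    cudaFree(src);
    cudaFree(dst);
    return 0;
}
```

The removed kernel applies the same per-tile logic to a batch of matrices, with blockIdx.z and the BLOCK_NM loop selecting which ne00 x ne01 matrix a block works on.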