Commit a3b4d8d

clean up
1 parent 55859a8 commit a3b4d8d

File tree

6 files changed: +5 additions, -912 deletions

ggml/src/ggml-cuda/cpy.cu

Lines changed: 5 additions & 116 deletions
@@ -37,90 +37,6 @@ static __global__ void cpy_flt(const char * cx, char * cdst_direct, const int ne
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
-template <typename T>
-static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-    const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
-
-    char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst_direct;
-
-    const T * src = reinterpret_cast<const T *>(cx);
-    T * dst = reinterpret_cast<T *>(cdst);
-
-    const int64_t nmat = ne / (ne00 * ne01);
-    const int64_t n    = ne00 * ne01;
-
-    int width  = ne01;
-    int height = ne00;
-    int x  = blockIdx.x * TILE_DIM + threadIdx.x;
-    int y  = blockIdx.y * TILE_DIM + threadIdx.y;
-    int tx = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
-    int ty = blockIdx.x * TILE_DIM + threadIdx.y;
-
-    __shared__ T tile[TILE_DIM][TILE_DIM];
-
-    for (int i = 0; i < BLOCK_NM; ++i) {
-        const unsigned int imat = blockIdx.z * BLOCK_NM + i;
-        if (imat >= nmat) {
-            break;
-        }
-        // stage one TILE_DIM x TILE_DIM tile in shared memory; the column index
-        // is XOR-swizzled with the row so both phases avoid bank conflicts
-        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
-            if (x < width && y + j < height) {
-                const unsigned int idx = (y + j)*width + x;
-                const int row = threadIdx.y + j;
-                const int col = threadIdx.x ^ row;
-                tile[row][col] = src[imat*n + idx];
-            }
-        }
-        __syncthreads();
-
-        // write the tile back transposed, applying the same swizzle on the read
-        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
-            if (ty + j < width && tx < height) {
-                const unsigned int idx = (ty + j)*height + tx;
-                const int col = (threadIdx.y + j) ^ threadIdx.x;
-                dst[imat*n + idx] = tile[threadIdx.x][col];
-            }
-        }
-    }
-}
-
 static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
     float * cdstf = (float *)(cdsti);

@@ -228,28 +144,9 @@ static void ggml_cpy_flt_cuda(
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
-    if constexpr ((std::is_same_v<src_t, half> && std::is_same_v<dst_t, half> ||
-                   std::is_same_v<src_t, float> && std::is_same_v<dst_t, float>)
-                  && transpose) {
-        // one 32x8 thread block per 32x32 tile; blockIdx.z batches BLOCK_NM matrices
-        dim3 dimGrid( (ne01 + TILE_DIM - 1) / TILE_DIM,
-                      (ne00 + TILE_DIM - 1) / TILE_DIM,
-                      (ne/(ne00*ne01) + BLOCK_NM - 1) / BLOCK_NM );
-        dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
-        cpy_flt_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
-    } else {
-        const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-        cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-            (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
-    }
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }
 
 static void ggml_cpy_f32_q8_0_cuda(
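
The generic path that remains is layout-agnostic: each element's source and destination addresses are derived from per-dimension byte strides, so a transposed view (which only permutes strides) is still copied correctly, just without the coalescing benefit of the tiled kernel. A simplified 3D sketch of that index arithmetic follows; it is an assumed illustration using ggml's naming conventions, while the real cpy_flt handles four dimensions.

// Assumed illustration (not ggml code): strided element-wise copy.
__global__ void cpy_strided_f32(const char * cx, char * cdst, const int64_t ne,
        const int64_t ne00, const int64_t ne01,                          // src logical shape
        const int64_t nb00, const int64_t nb01, const int64_t nb02,      // src byte strides
        const int64_t ne10, const int64_t ne11,                          // dst logical shape
        const int64_t nb10, const int64_t nb11, const int64_t nb12) {    // dst byte strides
    const int64_t i = (int64_t) blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= ne) {
        return;
    }
    // unflatten the linear index against each tensor's own shape,
    // then apply that tensor's strides to get the byte offset
    const int64_t i02 = i / (ne00*ne01);
    const int64_t i01 = (i - i02*ne00*ne01) / ne00;
    const int64_t i00 = i - i02*ne00*ne01 - i01*ne00;

    const int64_t i12 = i / (ne10*ne11);
    const int64_t i11 = (i - i12*ne10*ne11) / ne10;
    const int64_t i10 = i - i12*ne10*ne11 - i11*ne10;

    *(float *)(cdst + i10*nb10 + i11*nb11 + i12*nb12) =
        *(const float *)(cx + i00*nb00 + i01*nb01 + i02*nb02);
}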
@@ -435,11 +332,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
             CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
         }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        if (src0->op_params[10] == 999) {
-            ggml_cpy_flt_cuda<float, float, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-        } else {
-            ggml_cpy_flt_cuda<float, float, false> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-        }
+        ggml_cpy_flt_cuda<float, float, false> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
         ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
@@ -470,11 +363,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        if (src0->op_params[10] == 999) {
-            ggml_cpy_flt_cuda<half, half, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-        } else {
-            ggml_cpy_flt_cuda<half, half, false> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-        }
+        ggml_cpy_flt_cuda<half, half, false> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
         ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {

ggml/src/ggml-cuda/cpy.cuh

Lines changed: 0 additions & 5 deletions
@@ -2,11 +2,6 @@
 
 #define CUDA_CPY_BLOCK_SIZE 64
 
-const int TILE_DIM = 32;
-const int BLOCK_ROWS = 8;
-const int BLOCK_NM = 8;
-
-
 void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false);
 
 void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
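
The three constants removed here existed only to shape the transpose kernel's launch: TILE_DIM is the edge of the 32x32 shared-memory tile (one warp wide), BLOCK_ROWS means each 32x8 thread block covers its tile in four row-strided passes, and BLOCK_NM batches up to eight matrices per grid z-slice. As a hypothetical illustration, copying a batch of 8 matrices of 48x48 floats would have launched:

dim3 grid((48 + TILE_DIM - 1)/TILE_DIM,   // 2 tile columns
          (48 + TILE_DIM - 1)/TILE_DIM,   // 2 tile rows
          (8  + BLOCK_NM - 1)/BLOCK_NM);  // 1 z-slice covering all 8 matrices
dim3 block(TILE_DIM, BLOCK_ROWS, 1);      // 32 x 8 = 256 threads per block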

ggml/src/ggml.c

Lines changed: 0 additions & 1 deletion
@@ -3612,7 +3612,6 @@ struct ggml_tensor * ggml_transpose(
 
     result->op     = GGML_OP_TRANSPOSE;
     result->src[0] = a;
-    result->op_params[10] = 999; // the transpose flag
 
     return result;
 }
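
The deleted assignment was a signaling hack: ggml_transpose returns a zero-copy view, and writing 999 into a spare op_params slot let the CUDA backend detect such views and dispatch the tiled transpose kernel. For context, the surrounding function in ggml looks roughly like this (paraphrased sketch of the ggml source, not a verbatim quote):

struct ggml_tensor * ggml_transpose(struct ggml_context * ctx, struct ggml_tensor * a) {
    struct ggml_tensor * result = ggml_view_tensor(ctx, a); // shares a's data
    // swap the sizes and byte strides of dims 0 and 1; no elements move
    result->ne[0] = a->ne[1];
    result->ne[1] = a->ne[0];
    result->nb[0] = a->nb[1];
    result->nb[1] = a->nb[0];

    result->op     = GGML_OP_TRANSPOSE;
    result->src[0] = a;

    return result;
}

Because the view carries the swapped strides, the stride-walking copy shown earlier reproduces the transposed data without a dedicated kernel, which is what makes this cleanup safe.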

tests/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
@@ -198,8 +198,6 @@ if (NOT LLAMA_SANITIZE_ADDRESS)
 endif()
 llama_build_and_test(test-gguf.cpp)
 llama_build_and_test(test-backend-ops.cpp)
-llama_build_and_test(test-conv2d-implicit.cpp)
-llama_build_and_test(test-transpose.cpp)
 
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp LABEL "model")
