@@ -13,18 +13,18 @@ constexpr uint WARPSIZE = 32;
1313
1414
1515// currently unused; kept for future split-k kernels
16- static __global__ void reduce_f32 (const float * __restrict__ x, float * __restrict__ dst, const int ncols, const int nrows) {
17- const int row = blockIdx .x ;
18- const int col = threadIdx .x ;
19-
20- float sum = 0 .0f ;
21- if (row * blockDim .x + col < ncols) {
22- for (int i = 0 ; i < nrows; ++i){
23- sum += x[i * ncols + row * blockDim .x + col];
24- }
25- dst[row * blockDim .x + col] = sum;
26- }
27- }
16+ // static __global__ void reduce_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols, const int nrows) {
17+ // const int row = blockIdx.x;
18+ // const int col = threadIdx.x;
19+
20+ // float sum = 0.0f;
21+ // if (row * blockDim.x + col < ncols) {
22+ // for (int i = 0; i < nrows; ++i){
23+ // sum += x[i * ncols + row * blockDim.x + col];
24+ // }
25+ // dst[row * blockDim.x + col] = sum;
26+ // }
27+ // }
2828
2929template <typename src_T, typename dst_T>
3030static __global__ void NCHW2NHWC (const src_T *src, dst_T * dst, const int ne, const int ne00, const int ne01){
@@ -1033,8 +1033,6 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
10331033 const uint OC = kernel->ne [3 ]; // output_channels
10341034 const uint B = input->ne [3 ]; // n_batches
10351035
1036- const int64_t total = B * OC * OH * OW;
1037-
10381036 param_t params = { B, IC, IH, IW, OC, KH, KW, ST_Y, ST_X, PD_Y, PD_X, DL_Y, DL_X, OH, OW };
10391037 params.SC_fastdiv = init_fastdiv_values (KW*IC);
10401038 params.OW_fastdiv = init_fastdiv_values (OW);
0 commit comments