@@ -2148,62 +2148,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     }
 }
 
-static int ggml_cuda_mul_mat_q(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-        const ggml_cgraph * cgraph, int node_n, bool is_gemv) {
-
-    auto stream = ctx.stream();
-
-    auto ne10_padded = GGML_PAD(src1->ne[0], MATRIX_ROW_PADDING);
-    auto nb10_padded = ne10_padded*sizeof(block_q8_1)/QK8_1;
-    auto quantized_size = nb10_padded*ggml_nrows(src1);
-    if (!is_gemv) {
-        quantized_size += get_mmq_x_max_host(ggml_cuda_info().devices[ctx.device].cc)*sizeof(block_q8_1_mmq);
-    }
-    ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool(), quantized_size);
-    if (is_gemv) {
-        quantize_row_q8_1_cuda((const float *)src1->data, (void *)src1_quantized.get(), src1->ne[0], src1->ne[1], src1->ne[2], ne10_padded,
-                src0->type, stream);
-        CUDA_CHECK(cudaGetLastError());
-
-        ggml_cuda_op_mul_mat_vec_q(ctx, src0, src1, dst, (const char *)src0->data, nullptr, src1_quantized.get(), (float *)dst->data,
-                0, src0->ne[1], src1->ne[1], ne10_padded, stream);
-        CUDA_CHECK(cudaGetLastError());
-    } else {
-        quantize_mmq_q8_1_cuda((const float *)src1->data, src1_quantized.get(), src1->ne[0], src1->ne[1], 1, ne10_padded, src0->type, stream);
-        CUDA_CHECK(cudaGetLastError());
-
-        ggml_cuda_op_mul_mat_q(ctx, src0, src1, dst, (const char *)src0->data, nullptr, src1_quantized.get(), (float *)dst->data,
-                0, src0->ne[1], src1->ne[1], ne10_padded, stream);
-        CUDA_CHECK(cudaGetLastError());
-    }
-
-    if (!cgraph) return node_n;
-
-    while (node_n + 1 < cgraph->n_nodes) {
-        dst = cgraph->nodes[node_n+1];
-        if (ggml_is_empty(dst) || dst->op == GGML_OP_RESHAPE || dst->op == GGML_OP_TRANSPOSE || dst->op == GGML_OP_VIEW
-                || dst->op == GGML_OP_PERMUTE || dst->op == GGML_OP_NONE) {
-            ++node_n; continue;
-        }
-        if (dst->op != GGML_OP_MUL_MAT || dst->src[1] != src1 || !ggml_is_quantized(dst->src[0]->type)) break;
-        if (!is_gemv && mmq_get_q8_1_ds_layout(src0->type) != mmq_get_q8_1_ds_layout(dst->src[0]->type)) break;
-        if (is_gemv) {
-            ggml_cuda_op_mul_mat_vec_q(ctx, dst->src[0], src1, dst, (const char *)dst->src[0]->data, nullptr, src1_quantized.get(),
-                    (float *)dst->data, 0, dst->src[0]->ne[1], src1->ne[1], ne10_padded, stream);
-        } else {
-            ggml_cuda_op_mul_mat_q(ctx, dst->src[0], src1, dst, (const char *)dst->src[0]->data, nullptr, src1_quantized.get(),
-                    (float *)dst->data, 0, dst->src[0]->ne[1], src1->ne[1], ne10_padded, stream);
-        }
-        CUDA_CHECK(cudaGetLastError());
-        ++node_n;
-    }
-
-    return node_n;
-
-}
-
-static int ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-        const ggml_cgraph * cgraph, int node_n) {
+static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
 
     // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
@@ -2248,10 +2193,6 @@ static int ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
     }
 
-    if (!split && (use_mul_mat_vec_q || use_mul_mat_q) && src1->ne[2]*src1->ne[3] == 1) {
-        return ggml_cuda_mul_mat_q(ctx, src0, src1, dst, cgraph, node_n, use_mul_mat_vec_q);
-    }
-
     // debug helpers
     // printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
     // printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -2279,7 +2220,6 @@ static int ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
     }
-    return node_n;
 }
 
 struct mmid_row_mapping {
@@ -2519,7 +2459,7 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
                 src1_row.data = src1_original + i11*nb11 + i12*nb12;
                 dst_row.data  = dst_original  + i1*nb1   + i2*nb2;
 
-                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row, nullptr, 0);
+                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
             }
         }
     } else {
@@ -2570,7 +2510,7 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
             dst_row.nb[2] = num_src1_rows*nb1;
             dst_row.nb[3] = num_src1_rows*nb1;
 
-            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row, nullptr, 0);
+            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
 
             {
                 dim3 block_dims(std::min((unsigned int)ne0, 768u));
@@ -2954,7 +2894,7 @@ static bool ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_te
                 ggml_cuda_op_mul_mat_q(ctx, &src0_1_row, &src1_row, &dst_row, (const char *)src0_1_row.data, nullptr, src1_quantized.get(), (float *)dst_row.data,
                         0, src0_1_row.ne[1], num_src1_rows, src1_padded_num_cols, stream);
             } else {
-                ggml_cuda_mul_mat(ctx, &src0_1_row, &src1_row, &dst_row, nullptr, 0);
+                ggml_cuda_mul_mat(ctx, &src0_1_row, &src1_row, &dst_row);
             }
             CUDA_CHECK(cudaGetLastError());
 
@@ -2971,7 +2911,7 @@ static bool ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_te
                 ggml_cuda_op_mul_mat_q(ctx, &src0_2_row, &src1_row, &dst_row, (const char *)src0_2_row.data, nullptr, src1_quantized.get(), (float *)dst_row.data,
                         0, src0_2_row.ne[1], num_src1_rows, src1_padded_num_cols, stream);
             } else {
-                ggml_cuda_mul_mat(ctx, &src0_2_row, &src1_row, &dst_row, nullptr, 0);
+                ggml_cuda_mul_mat(ctx, &src0_2_row, &src1_row, &dst_row);
             }
             CUDA_CHECK(cudaGetLastError());
 
@@ -3012,7 +2952,8 @@ static bool ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_te
                         (int)dst_row.ne[0], (int)dst_row.ne[1], (int)dst_row.ne[2], (int)dst_row.ne[3]);
                 first = false;
             }
-            ggml_cuda_mul_mat(ctx, &final_src, &dst_row, &final_dst, nullptr, 0);
+            ggml_cuda_mul_mat(ctx, &final_src, &dst_row, &final_dst);
+            //ggml_cuda_mul_mat(ctx, next->src[0], &dst_row, &final_dst);
             CUDA_CHECK(cudaGetLastError());
 
             dim3 block_dims(std::min((unsigned int)next->ne[0], 768u));
@@ -3095,7 +3036,8 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
 
 }
 
-static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst, const ggml_cgraph * cgraph, int & i) {
+static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst, struct ggml_tensor * next,
+        const ggml_cgraph * cgraph, int & i) {
     // why is this here instead of mul_mat?
     if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
         ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
@@ -3105,8 +3047,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
     int64_t tim1 = ggml_time_us();
 #endif
 
-    auto next = i < cgraph->n_nodes - 1 ? cgraph->nodes[i+1] : nullptr;
-
     switch (dst->op) {
         case GGML_OP_ARGMAX:
             ggml_cuda_argmax(ctx, dst);
@@ -3214,7 +3154,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                     ggml_cuda_op_hardswish(ctx, dst);
                     break;
                 default:
-                    return -1;
+                    return false;
             }
             break;
         case GGML_OP_GLU:
@@ -3274,9 +3214,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
                 GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
-                return -1;
+                return false;
             } else {
-                i = ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst, cgraph, i);
+                ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
             }
             break;
         case GGML_OP_MUL_MAT_ID:
@@ -3725,6 +3665,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
+        ggml_tensor * next = i < cgraph->n_nodes-1 ? cgraph->nodes[i+1] : nullptr;
 
         if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
@@ -3759,7 +3700,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
         GGML_UNUSED(integrated);
 #endif // NDEBUG
 
-        bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, cgraph, i);
+        bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, next, cgraph, i);
         if (!ok) {
             GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
         }