@@ -1769,23 +1769,39 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
17691769 // printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
17701770 // printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
17711771
1772+ #if defined(GGML_OP_PERF)
1773+ const uint64_t mm_start_us = ggml_time_us ();
1774+ #endif // defined(GGML_OP_PERF)
1775+ enum MUL_MAT_BRANCH_ENUM mul_mat_branch;
1776+
17721777 if (!split && use_mul_mat_vec && dst->ne [3 ] == 1 && (src0->ne [1 ] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
17731778 // the custom F16 vector kernel can be used over batched cuBLAS GEMM
17741779 // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
17751780 ggml_cuda_mul_mat_vec (ctx, src0, src1, dst);
1781+ mul_mat_branch = mm_ggml_cuda_mul_mat_vec;
17761782 } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
17771783 && !ggml_is_transposed (src0) && !ggml_is_transposed (src1) && src1->ne [2 ]*src1->ne [3 ] > 1 ) {
17781784 // general KQ + KQV multi-batch without FlashAttention
17791785 ggml_cuda_mul_mat_batched_cublas (ctx, src0, src1, dst);
1786+ mul_mat_branch = mm_ggml_cuda_mul_mat_batched_cublas;
17801787 } else if (use_mul_mat_vec) {
17811788 ggml_cuda_op_mul_mat (ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr );
1789+ mul_mat_branch = mm_ggml_cuda_op_mul_mat_vec;
17821790 } else if (use_mul_mat_vec_q) {
17831791 ggml_cuda_op_mul_mat (ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
1792+ mul_mat_branch = mm_ggml_cuda_op_mul_mat_vec_q;
17841793 } else if (use_mul_mat_q) {
17851794 ggml_cuda_op_mul_mat (ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
1795+ mul_mat_branch = mm_ggml_cuda_op_mul_mat_q;
17861796 } else {
17871797 ggml_cuda_op_mul_mat (ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr );
1798+ mul_mat_branch = mm_ggml_cuda_op_mul_mat_cublas;
17881799 }
1800+
1801+ #if defined(GGML_OP_PERF)
1802+ const uint64_t mm_end_us = ggml_time_us ();
1803+ mul_mat_branch_stats[mul_mat_branch] += mm_end_us - mm_start_us;
1804+ #endif // defined(GGML_OP_PERF)
17891805}
17901806
17911807struct mmid_row_mapping {
@@ -2634,6 +2650,19 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
26342650 op_stats[i][OP_TOTAL_TIME], 100 * op_stats[i][OP_TOTAL_TIME] / total_time
26352651 );
26362652 }
2653+ float total_mm_time = op_stats[GGML_OP_MUL_MAT][OP_TOTAL_TIME];
2654+ // float total_mm_time = 0;
2655+ // for (int i = 0; i < mm_gpu_branch_count; ++i) {
2656+ // total_mm_time += mul_mat_branch_stats[i];
2657+ // }
2658+ for (int i = 0 ; i < mm_gpu_branch_count; i++) {
2659+ fprintf (logFile,
2660+ " MM[%d] Stat: time = %12.0f, time%% = %3.2f%%\n " ,
2661+ i,
2662+ mul_mat_branch_stats[i],
2663+ 100 * mul_mat_branch_stats[i] / total_mm_time
2664+ );
2665+ }
26372666 fclose (logFile);
26382667 }
26392668#endif // defined(GGML_OP_PERF)
0 commit comments