@@ -1881,39 +1881,23 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
18811881 // printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
18821882 // printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
18831883
1884- #if defined(GGML_OP_PERF)
1885- const uint64_t mm_start_us = ggml_time_us ();
1886- #endif // defined(GGML_OP_PERF)
1887- enum MUL_MAT_BRANCH_ENUM mul_mat_branch;
1888-
18891884 if (!split && use_mul_mat_vec && dst->ne [3 ] == 1 && (src0->ne [1 ] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
18901885 // the custom F16 vector kernel can be used over batched cuBLAS GEMM
18911886 // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
18921887 ggml_cuda_mul_mat_vec (ctx, src0, src1, dst);
1893- mul_mat_branch = mm_ggml_cuda_mul_mat_vec;
18941888 } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
18951889 && !ggml_is_transposed (src0) && !ggml_is_transposed (src1) && src1->ne [2 ]*src1->ne [3 ] > 1 ) {
18961890 // general KQ + KQV multi-batch without FlashAttention
18971891 ggml_cuda_mul_mat_batched_cublas (ctx, src0, src1, dst);
1898- mul_mat_branch = mm_ggml_cuda_mul_mat_batched_cublas;
18991892 } else if (use_mul_mat_vec) {
19001893 ggml_cuda_op_mul_mat (ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr );
1901- mul_mat_branch = mm_ggml_cuda_op_mul_mat_vec;
19021894 } else if (use_mul_mat_vec_q) {
19031895 ggml_cuda_op_mul_mat (ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
1904- mul_mat_branch = mm_ggml_cuda_op_mul_mat_vec_q;
19051896 } else if (use_mul_mat_q) {
19061897 ggml_cuda_op_mul_mat (ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
1907- mul_mat_branch = mm_ggml_cuda_op_mul_mat_q;
19081898 } else {
19091899 ggml_cuda_op_mul_mat (ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr );
1910- mul_mat_branch = mm_ggml_cuda_op_mul_mat_cublas;
19111900 }
1912-
1913- #if defined(GGML_OP_PERF)
1914- const uint64_t mm_end_us = ggml_time_us ();
1915- mul_mat_branch_stats[mul_mat_branch] += mm_end_us - mm_start_us;
1916- #endif // defined(GGML_OP_PERF)
19171901}
19181902
19191903struct mmid_row_mapping {
@@ -2669,21 +2653,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
26692653 }
26702654#endif
26712655
2672- #if defined(GGML_OP_PERF)
2673- const uint64_t op_start_us = ggml_time_us ();
2674- #endif // defined(GGML_OP_PERF)
2675-
26762656 bool ok = ggml_cuda_compute_forward (*cuda_ctx, node);
26772657 if (!ok) {
26782658 GGML_LOG_ERROR (" %s: op not supported %s (%s)\n " , __func__, node->name , ggml_op_name (node->op ));
26792659 }
26802660 GGML_ASSERT (ok);
2681-
2682- #if defined(GGML_OP_PERF)
2683- const uint64_t op_end_us = ggml_time_us ();
2684- op_stats[node->op ][OP_COUNT] += 1 ;
2685- op_stats[node->op ][OP_TOTAL_TIME] += op_end_us - op_start_us;
2686- #endif // defined(GGML_OP_PERF)
26872661 }
26882662 }
26892663
@@ -2793,45 +2767,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
27932767
27942768 evaluate_and_capture_cuda_graph (cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
27952769
2796- #if defined(GGML_OP_PERF)
2797- {
2798- FILE *logFile = fopen (" ggml_op_perf.log" , " a" );
2799- fprintf (logFile, " ## compute stats for each op: ##################################################\n " );
2800- fprintf (logFile, " >> cc = %d, vmm = %d, total_vram = %u\n " ,
2801- ggml_cuda_info ().devices [cuda_ctx->device ].cc ,
2802- ggml_cuda_info ().devices [cuda_ctx->device ].vmm ,
2803- ggml_cuda_info ().devices [cuda_ctx->device ].total_vram
2804- );
2805- float total_time = 0 , total_count = 0 ;
2806- for (int i = 0 ; i < GGML_OP_COUNT; ++i) {
2807- total_count += op_stats[i][OP_COUNT];
2808- total_time += op_stats[i][OP_TOTAL_TIME];
2809- }
2810- for (int i = 0 ; i < GGML_OP_COUNT; ++i) {
2811- fprintf (logFile,
2812- " OP[%d] Stat: count = %9.0f, count%% = %3.2f%%, time = %12.0f, time%% = %3.2f%%\n " ,
2813- i,
2814- op_stats[i][OP_COUNT], 100 * op_stats[i][OP_COUNT] / total_count,
2815- op_stats[i][OP_TOTAL_TIME], 100 * op_stats[i][OP_TOTAL_TIME] / total_time
2816- );
2817- }
2818- float total_mm_time = op_stats[GGML_OP_MUL_MAT][OP_TOTAL_TIME];
2819- // float total_mm_time = 0;
2820- // for (int i = 0; i < mm_gpu_branch_count; ++i) {
2821- // total_mm_time += mul_mat_branch_stats[i];
2822- // }
2823- for (int i = 0 ; i < mm_gpu_branch_count; i++) {
2824- fprintf (logFile,
2825- " MM[%d] Stat: time = %12.0f, time%% = %3.2f%%\n " ,
2826- i,
2827- mul_mat_branch_stats[i],
2828- 100 * mul_mat_branch_stats[i] / total_mm_time
2829- );
2830- }
2831- fclose (logFile);
2832- }
2833- #endif // defined(GGML_OP_PERF)
2834-
28352770 return GGML_STATUS_SUCCESS;
28362771}
28372772
0 commit comments