Remove the GGML_OP_PERF op profiling code (ggml.h, ggml-cuda.cu)

Huaishun Hu · Huaishun Hu · commit 24c039b789bf · 2025-02-13T16:58:22.000+08:00
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
@@ -2188,27 +2188,6 @@ extern "C" {
     GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
     GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
 
-// #define GGML_OP_PERF
-// op: [ count, total_time ]
-enum OP_STAT_ENUM {
-    OP_COUNT = 0,
-    OP_TOTAL_TIME,
-    OP_STAT_ENUM_LEN,
-};
-enum MUL_MAT_BRANCH_ENUM {
-    mm_ggml_cuda_mul_mat_vec            = 0,
-    mm_ggml_cuda_mul_mat_batched_cublas = 1,
-    mm_ggml_cuda_op_mul_mat_vec         = 2,
-    mm_ggml_cuda_op_mul_mat_vec_q       = 3,
-    mm_ggml_cuda_op_mul_mat_q           = 4,
-    mm_ggml_cuda_op_mul_mat_cublas      = 5,
-    mm_gpu_branch_count                 = 6,
-};
-#if defined(GGML_OP_PERF)
-    static float op_stats[GGML_OP_COUNT][OP_STAT_ENUM_LEN] = {0};
-    static float mul_mat_branch_stats[mm_gpu_branch_count] = {0};
-#endif // defined(GGML_OP_PERF)
-
 #ifdef  __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1881,39 +1881,23 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-#if defined(GGML_OP_PERF)
-    const uint64_t mm_start_us = ggml_time_us();
-#endif // defined(GGML_OP_PERF)
-    enum MUL_MAT_BRANCH_ENUM mul_mat_branch;
-
     if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
-        mul_mat_branch = mm_ggml_cuda_mul_mat_vec;
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
                && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
-        mul_mat_branch = mm_ggml_cuda_mul_mat_batched_cublas;
     } else if (use_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
-        mul_mat_branch = mm_ggml_cuda_op_mul_mat_vec;
     } else if (use_mul_mat_vec_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
-        mul_mat_branch = mm_ggml_cuda_op_mul_mat_vec_q;
     } else if (use_mul_mat_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
-        mul_mat_branch = mm_ggml_cuda_op_mul_mat_q;
     } else {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
-        mul_mat_branch = mm_ggml_cuda_op_mul_mat_cublas;
     }
-
-#if defined(GGML_OP_PERF)
-    const uint64_t mm_end_us = ggml_time_us();
-    mul_mat_branch_stats[mul_mat_branch] += mm_end_us - mm_start_us;
-#endif // defined(GGML_OP_PERF)
 }
 
 struct mmid_row_mapping {
@@ -2669,21 +2653,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 }
 #endif
 
-#if defined(GGML_OP_PERF)
-                const uint64_t op_start_us = ggml_time_us();
-#endif // defined(GGML_OP_PERF)
-
                 bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
                 if (!ok) {
                     GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
                 }
                 GGML_ASSERT(ok);
-
-#if defined(GGML_OP_PERF)
-                const uint64_t op_end_us = ggml_time_us();
-                op_stats[node->op][OP_COUNT]      += 1;
-                op_stats[node->op][OP_TOTAL_TIME] += op_end_us - op_start_us;
-#endif // defined(GGML_OP_PERF)
             }
         }
 
@@ -2793,45 +2767,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 
     evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
 
-#if defined(GGML_OP_PERF)
-    {
-        FILE *logFile = fopen("ggml_op_perf.log", "a");
-        fprintf(logFile, "## compute stats for each op: ##################################################\n");
-        fprintf(logFile, ">> cc = %d, vmm = %d, total_vram = %u\n",
-            ggml_cuda_info().devices[cuda_ctx->device].cc,
-            ggml_cuda_info().devices[cuda_ctx->device].vmm,
-            ggml_cuda_info().devices[cuda_ctx->device].total_vram
-        );
-        float total_time = 0, total_count = 0;
-        for (int i = 0; i < GGML_OP_COUNT; ++i) {
-            total_count += op_stats[i][OP_COUNT];
-            total_time  += op_stats[i][OP_TOTAL_TIME];
-        }
-        for (int i = 0; i < GGML_OP_COUNT; ++i) {
-            fprintf(logFile,
-                "OP[%d] Stat: count = %9.0f, count%% = %3.2f%%, time = %12.0f, time%% = %3.2f%%\n",
-                i,
-                op_stats[i][OP_COUNT],      100 * op_stats[i][OP_COUNT] / total_count,
-                op_stats[i][OP_TOTAL_TIME], 100 * op_stats[i][OP_TOTAL_TIME] / total_time 
-            );
-        }
-        float total_mm_time = op_stats[GGML_OP_MUL_MAT][OP_TOTAL_TIME];
-        // float total_mm_time = 0;
-        // for (int i = 0; i < mm_gpu_branch_count; ++i) {
-        //     total_mm_time  += mul_mat_branch_stats[i];
-        // }
-        for (int i = 0; i < mm_gpu_branch_count; i++) {
-            fprintf(logFile,
-                "MM[%d] Stat: time = %12.0f, time%% = %3.2f%%\n",
-                i,
-                mul_mat_branch_stats[i],
-                100 * mul_mat_branch_stats[i] / total_mm_time
-            );
-        }
-        fclose(logFile);
-    }
-#endif // defined(GGML_OP_PERF)
-
     return GGML_STATUS_SUCCESS;
 }