Skip to content

Commit 24c039b

Browse files
author
Huaishun Hu
committed
remove the op profiling code
1 parent 2030e59 commit 24c039b

File tree

2 files changed

+0
-86
lines changed

2 files changed

+0
-86
lines changed

ggml/include/ggml.h

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2188,27 +2188,6 @@ extern "C" {
21882188
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
21892189
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
21902190

2191-
// #define GGML_OP_PERF
2192-
// op: [ count, total_time ]
2193-
enum OP_STAT_ENUM {
2194-
OP_COUNT = 0,
2195-
OP_TOTAL_TIME,
2196-
OP_STAT_ENUM_LEN,
2197-
};
2198-
enum MUL_MAT_BRANCH_ENUM {
2199-
mm_ggml_cuda_mul_mat_vec = 0,
2200-
mm_ggml_cuda_mul_mat_batched_cublas = 1,
2201-
mm_ggml_cuda_op_mul_mat_vec = 2,
2202-
mm_ggml_cuda_op_mul_mat_vec_q = 3,
2203-
mm_ggml_cuda_op_mul_mat_q = 4,
2204-
mm_ggml_cuda_op_mul_mat_cublas = 5,
2205-
mm_gpu_branch_count = 6,
2206-
};
2207-
#if defined(GGML_OP_PERF)
2208-
static float op_stats[GGML_OP_COUNT][OP_STAT_ENUM_LEN] = {0};
2209-
static float mul_mat_branch_stats[mm_gpu_branch_count] = {0};
2210-
#endif // defined(GGML_OP_PERF)
2211-
22122191
#ifdef __cplusplus
22132192
}
22142193
#endif

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 0 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1881,39 +1881,23 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
18811881
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
18821882
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
18831883

1884-
#if defined(GGML_OP_PERF)
1885-
const uint64_t mm_start_us = ggml_time_us();
1886-
#endif // defined(GGML_OP_PERF)
1887-
enum MUL_MAT_BRANCH_ENUM mul_mat_branch;
1888-
18891884
if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
18901885
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
18911886
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
18921887
ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
1893-
mul_mat_branch = mm_ggml_cuda_mul_mat_vec;
18941888
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
18951889
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
18961890
// general KQ + KQV multi-batch without FlashAttention
18971891
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
1898-
mul_mat_branch = mm_ggml_cuda_mul_mat_batched_cublas;
18991892
} else if (use_mul_mat_vec) {
19001893
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
1901-
mul_mat_branch = mm_ggml_cuda_op_mul_mat_vec;
19021894
} else if (use_mul_mat_vec_q) {
19031895
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
1904-
mul_mat_branch = mm_ggml_cuda_op_mul_mat_vec_q;
19051896
} else if (use_mul_mat_q) {
19061897
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
1907-
mul_mat_branch = mm_ggml_cuda_op_mul_mat_q;
19081898
} else {
19091899
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
1910-
mul_mat_branch = mm_ggml_cuda_op_mul_mat_cublas;
19111900
}
1912-
1913-
#if defined(GGML_OP_PERF)
1914-
const uint64_t mm_end_us = ggml_time_us();
1915-
mul_mat_branch_stats[mul_mat_branch] += mm_end_us - mm_start_us;
1916-
#endif // defined(GGML_OP_PERF)
19171901
}
19181902

19191903
struct mmid_row_mapping {
@@ -2669,21 +2653,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
26692653
}
26702654
#endif
26712655

2672-
#if defined(GGML_OP_PERF)
2673-
const uint64_t op_start_us = ggml_time_us();
2674-
#endif // defined(GGML_OP_PERF)
2675-
26762656
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
26772657
if (!ok) {
26782658
GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
26792659
}
26802660
GGML_ASSERT(ok);
2681-
2682-
#if defined(GGML_OP_PERF)
2683-
const uint64_t op_end_us = ggml_time_us();
2684-
op_stats[node->op][OP_COUNT] += 1;
2685-
op_stats[node->op][OP_TOTAL_TIME] += op_end_us - op_start_us;
2686-
#endif // defined(GGML_OP_PERF)
26872661
}
26882662
}
26892663

@@ -2793,45 +2767,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
27932767

27942768
evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
27952769

2796-
#if defined(GGML_OP_PERF)
2797-
{
2798-
FILE *logFile = fopen("ggml_op_perf.log", "a");
2799-
fprintf(logFile, "## compute stats for each op: ##################################################\n");
2800-
fprintf(logFile, ">> cc = %d, vmm = %d, total_vram = %u\n",
2801-
ggml_cuda_info().devices[cuda_ctx->device].cc,
2802-
ggml_cuda_info().devices[cuda_ctx->device].vmm,
2803-
ggml_cuda_info().devices[cuda_ctx->device].total_vram
2804-
);
2805-
float total_time = 0, total_count = 0;
2806-
for (int i = 0; i < GGML_OP_COUNT; ++i) {
2807-
total_count += op_stats[i][OP_COUNT];
2808-
total_time += op_stats[i][OP_TOTAL_TIME];
2809-
}
2810-
for (int i = 0; i < GGML_OP_COUNT; ++i) {
2811-
fprintf(logFile,
2812-
"OP[%d] Stat: count = %9.0f, count%% = %3.2f%%, time = %12.0f, time%% = %3.2f%%\n",
2813-
i,
2814-
op_stats[i][OP_COUNT], 100 * op_stats[i][OP_COUNT] / total_count,
2815-
op_stats[i][OP_TOTAL_TIME], 100 * op_stats[i][OP_TOTAL_TIME] / total_time
2816-
);
2817-
}
2818-
float total_mm_time = op_stats[GGML_OP_MUL_MAT][OP_TOTAL_TIME];
2819-
// float total_mm_time = 0;
2820-
// for (int i = 0; i < mm_gpu_branch_count; ++i) {
2821-
// total_mm_time += mul_mat_branch_stats[i];
2822-
// }
2823-
for (int i = 0; i < mm_gpu_branch_count; i++) {
2824-
fprintf(logFile,
2825-
"MM[%d] Stat: time = %12.0f, time%% = %3.2f%%\n",
2826-
i,
2827-
mul_mat_branch_stats[i],
2828-
100 * mul_mat_branch_stats[i] / total_mm_time
2829-
);
2830-
}
2831-
fclose(logFile);
2832-
}
2833-
#endif // defined(GGML_OP_PERF)
2834-
28352770
return GGML_STATUS_SUCCESS;
28362771
}
28372772

0 commit comments

Comments
 (0)