Commit ca3ef2a

Author: Huaishun Hu (committed)
perf mul_mat branch stats
1 parent 7420f98 · commit ca3ef2a

File tree

2 files changed, +45 -6 lines changed


ggml/include/ggml.h

Lines changed: 16 additions & 6 deletions
@@ -2250,14 +2250,24 @@ extern "C" {
     GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
 
 #define GGML_OP_PERF
+    // op: [ count, total_time ]
+    enum OP_STAT_ENUM {
+        OP_COUNT = 0,
+        OP_TOTAL_TIME,
+        OP_STAT_ENUM_LEN,
+    };
+    enum MUL_MAT_BRANCH_ENUM {
+        mm_ggml_cuda_mul_mat_vec = 0,
+        mm_ggml_cuda_mul_mat_batched_cublas = 1,
+        mm_ggml_cuda_op_mul_mat_vec = 2,
+        mm_ggml_cuda_op_mul_mat_vec_q = 3,
+        mm_ggml_cuda_op_mul_mat_q = 4,
+        mm_ggml_cuda_op_mul_mat_cublas = 5,
+        mm_gpu_branch_count = 6,
+    };
 #if defined(GGML_OP_PERF)
-    // op: [ count, total_time ]
-    enum OP_STAT_ENUM {
-        OP_COUNT = 0,
-        OP_TOTAL_TIME,
-        OP_STAT_ENUM_LEN,
-    };
     static float op_stats[GGML_OP_COUNT][OP_STAT_ENUM_LEN] = {0};
+    static float mul_mat_branch_stats[mm_gpu_branch_count] = {0};
 #endif // defined(GGML_OP_PERF)
 
 #ifdef __cplusplus
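
Note on the header change: OP_STAT_ENUM moves out of the GGML_OP_PERF guard, and a new MUL_MAT_BRANCH_ENUM plus the per-branch accumulator mul_mat_branch_stats are added next to the existing op_stats table. A minimal sketch of how that accumulator is indexed is shown below; the helper name mm_branch_add_time is hypothetical and not part of this commit.

    // Sketch only: mm_branch_add_time() is a hypothetical helper, not part of
    // this commit. It shows how mul_mat_branch_stats is indexed by
    // MUL_MAT_BRANCH_ENUM and what each slot accumulates.
    static inline void mm_branch_add_time(enum MUL_MAT_BRANCH_ENUM branch, float elapsed_us) {
        if (branch < mm_gpu_branch_count) {
            mul_mat_branch_stats[branch] += elapsed_us; // cumulative wall-clock time, in microseconds
        }
    }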

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 29 additions & 0 deletions
@@ -1769,23 +1769,39 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
+#if defined(GGML_OP_PERF)
+    const uint64_t mm_start_us = ggml_time_us();
+#endif // defined(GGML_OP_PERF)
+    enum MUL_MAT_BRANCH_ENUM mul_mat_branch;
+
     if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
+        mul_mat_branch = mm_ggml_cuda_mul_mat_vec;
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
         && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
+        mul_mat_branch = mm_ggml_cuda_mul_mat_batched_cublas;
     } else if (use_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
+        mul_mat_branch = mm_ggml_cuda_op_mul_mat_vec;
     } else if (use_mul_mat_vec_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
+        mul_mat_branch = mm_ggml_cuda_op_mul_mat_vec_q;
     } else if (use_mul_mat_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
+        mul_mat_branch = mm_ggml_cuda_op_mul_mat_q;
     } else {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
+        mul_mat_branch = mm_ggml_cuda_op_mul_mat_cublas;
     }
+
+#if defined(GGML_OP_PERF)
+    const uint64_t mm_end_us = ggml_time_us();
+    mul_mat_branch_stats[mul_mat_branch] += mm_end_us - mm_start_us;
+#endif // defined(GGML_OP_PERF)
 }
 
 struct mmid_row_mapping {
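
The instrumentation above reads ggml_time_us() before the dispatch, records the branch that was taken in mul_mat_branch (every arm of the if/else chain assigns it, so the index is always set), and adds the elapsed microseconds to that branch's slot afterwards. The same bookkeeping could be expressed as a small scope timer; the sketch below is illustrative only (mm_branch_timer is hypothetical and not part of this commit), assuming GGML_OP_PERF is defined.

    // Sketch only: a hypothetical scope timer that accumulates into
    // mul_mat_branch_stats when it goes out of scope. Not part of this commit.
    struct mm_branch_timer {
        enum MUL_MAT_BRANCH_ENUM branch = mm_gpu_branch_count; // sentinel: no branch recorded yet
        int64_t start_us = ggml_time_us();                     // ggml_time_us() returns microseconds

        ~mm_branch_timer() {
            if (branch != mm_gpu_branch_count) {
                mul_mat_branch_stats[branch] += ggml_time_us() - start_us;
            }
        }
    };

    // Usage inside ggml_cuda_mul_mat would look like:
    //     mm_branch_timer timer;
    //     ...
    //     ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
    //     timer.branch = mm_ggml_cuda_mul_mat_vec;
    // The elapsed time is added when timer is destroyed at the end of the function.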
@@ -2634,6 +2650,19 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 op_stats[i][OP_TOTAL_TIME], 100 * op_stats[i][OP_TOTAL_TIME] / total_time
             );
         }
+        float total_mm_time = op_stats[GGML_OP_MUL_MAT][OP_TOTAL_TIME];
+        // float total_mm_time = 0;
+        // for (int i = 0; i < mm_gpu_branch_count; ++i) {
+        //     total_mm_time += mul_mat_branch_stats[i];
+        // }
+        for (int i = 0; i < mm_gpu_branch_count; i++) {
+            fprintf(logFile,
+                "MM[%d] Stat: time = %12.0f, time%% = %3.2f%%\n",
+                i,
+                mul_mat_branch_stats[i],
+                100 * mul_mat_branch_stats[i] / total_mm_time
+            );
+        }
         fclose(logFile);
     }
 #endif // defined(GGML_OP_PERF)
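
In the report above, total_mm_time comes from op_stats[GGML_OP_MUL_MAT][OP_TOTAL_TIME], so each percentage is that branch's share of the overall MUL_MAT time; the commented-out variant would instead normalize by the sum of the branch timers. The log identifies each branch only by its numeric index, in MUL_MAT_BRANCH_ENUM order. A sketch of a name table that would make the output self-describing follows; mm_branch_names is hypothetical and not part of this commit.

    // Sketch only: branch index -> readable name, following MUL_MAT_BRANCH_ENUM
    // order. mm_branch_names is a hypothetical table, not part of this commit.
    static const char * mm_branch_names[mm_gpu_branch_count] = {
        "ggml_cuda_mul_mat_vec",            // mm_ggml_cuda_mul_mat_vec            = 0
        "ggml_cuda_mul_mat_batched_cublas", // mm_ggml_cuda_mul_mat_batched_cublas = 1
        "ggml_cuda_op_mul_mat_vec",         // mm_ggml_cuda_op_mul_mat_vec         = 2
        "ggml_cuda_op_mul_mat_vec_q",       // mm_ggml_cuda_op_mul_mat_vec_q       = 3
        "ggml_cuda_op_mul_mat_q",           // mm_ggml_cuda_op_mul_mat_q           = 4
        "ggml_cuda_op_mul_mat_cublas",      // mm_ggml_cuda_op_mul_mat_cublas      = 5
    };
    // The report loop could then print mm_branch_names[i] alongside the index i.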

0 commit comments
