Skip to content

Commit 5751a74

Browse files
committed
Added perfetto support for the CPU + Vulkan backends
This uses perfetto in-process profiling, and will produce a perfetto binary trace by the end of the inference. This is very useful to help visualise how the backend handles the inference. Build: cmake -S . -B build-vk -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGGML_VULKAN=ON cmake --build build-vk -j8 Run: GGML_VK_PERF_SILENT=1 GGML_VK_PERF_LOGGER=1 LLAMA_PERFETTO_TRACE=./out.pftrace build-vk/bin/llama-cli -m model.gguf Test: Tested on M4 Mac In detail this patch does the following: 1. Including the `llama_perfetto.h` header file, which contains the definitions for the Perfetto-related functions and variables used in this example. 2. Calling the `llama_perfetto_start()` function to start tracing at the beginning of the conversation. 3. Calling the `llama_perfetto_stop_flush()` function to stop tracing and flush the trace after each generation. 4. Adding a call to the `llama_perfetto_trace_begin_with_text()` function to begin an event in Perfetto with a text description of the current evaluation. 5. Adding a call to the `llama_perfetto_trace_end()` function to end the event after each evaluation. 6. Adding a call to the `llama_perfetto_counter_tokens_per_s()` function to update the Perfetto counter for tokens per second during idle periods. 7. Calling the `llama_perfetto_emit_gpu_timeline()` function to emit GPU timeline slices into Perfetto. 8. Adding a call to the `llama_perfetto_print_gpu_stats()` function to print GPU statistics at idle periods. 9. Calling the `llama_perfetto_flush_dump_stats()` function to flush and dump the Perfetto trace stats to a file at idle periods.
1 parent 3d16b29 commit 5751a74

File tree

16 files changed

+243357
-12
lines changed

16 files changed

+243357
-12
lines changed

.github/workflows/vulkan-build.yml

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
name: Vulkan Build (No Run)
2+
3+
on:
4+
push:
5+
branches: ["**"]
6+
pull_request:
7+
8+
jobs:
9+
build-vulkan:
10+
runs-on: ubuntu-latest
11+
steps:
12+
- name: Checkout repository
13+
uses: actions/checkout@v4
14+
with:
15+
submodules: recursive
16+
17+
- name: Install Vulkan build dependencies
18+
run: |
19+
sudo apt-get update
20+
# Vulkan SDK headers/libs + tools needed by GGML's Vulkan backend
21+
sudo apt-get install -y \
22+
libvulkan-dev \
23+
vulkan-validationlayers-dev \
24+
spirv-tools \
25+
libglslang-dev \
26+
glslang-tools
27+
28+
- name: Configure (CMake, Vulkan enabled)
29+
run: |
30+
cmake -S . -B build-vk -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGGML_VULKAN=ON
31+
32+
- name: Build
33+
run: |
34+
cmake --build build-vk -j8
35+
36+
- name: Verify build outputs
37+
run: |
38+
ls -la build-vk/bin || true
39+
test -e build-vk/bin/llama-cli
40+
41+
- name: Show test command (not executed)
42+
run: |
43+
echo "To run (requires Vulkan-capable GPU):"
44+
echo "GGML_VK_PERF_SILENT=1 GGML_VK_PERF_LOGGER=1 LLAMA_PERFETTO_TRACE=./<out>.pftrace build-vk/bin/llama-cli -m <model>.gguf"
45+
46+
- name: Upload build artifacts
47+
uses: actions/upload-artifact@v4
48+
with:
49+
name: build-vk-bin
50+
path: build-vk/bin/

common/common.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,9 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
348348
return true;
349349
}
350350

351+
extern "C" void llama_perfetto_try_start_from_env(void);
352+
extern "C" void llama_perfetto_stop_flush(void);
353+
351354
void common_init() {
352355
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
353356
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
@@ -362,6 +365,11 @@ void common_init() {
362365
#endif
363366

364367
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
368+
369+
// Start Perfetto in-process tracing if requested via env vars.
370+
// This enables trace generation for all tools uniformly.
371+
llama_perfetto_try_start_from_env();
372+
atexit([](){ llama_perfetto_stop_flush(); });
365373
}
366374

367375
std::string common_params_get_system_info(const common_params & params) {

ggml/include/ggml-vulkan.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(voi
2424

2525
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
2626

27+
// Utility: collect basic GPU counters via Vulkan pipeline statistics
28+
// and dump them to a text file at `path`. Returns true on success.
29+
// Currently records `VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT`.
30+
GGML_BACKEND_API bool ggml_backend_vk_dump_pipeline_stats(int device, const char * path);
31+
2732
#ifdef __cplusplus
2833
}
2934
#endif

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@
3838
#include <syscall.h>
3939
#endif
4040

41+
// Weak no-op perfetto shims to avoid link errors when ggml is linked as a shared library
42+
// and the C++ perfetto glue is not part of the ggml target.
43+
__attribute__((weak)) void llama_perfetto_trace_begin(const char * name) { (void)name; }
44+
__attribute__((weak)) void llama_perfetto_trace_end(void) { }
45+
4146
#ifdef GGML_USE_OPENMP
4247
#include <omp.h>
4348
#endif
@@ -1754,35 +1759,63 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
17541759
} break;
17551760
case GGML_OP_SILU_BACK:
17561761
{
1762+
// Perfetto: SiLU Backward
1763+
#include "../../include/llama_perfetto.h"
1764+
llama_perfetto_trace_begin("silu_back");
17571765
ggml_compute_forward_silu_back(params, tensor);
1766+
llama_perfetto_trace_end();
17581767
} break;
17591768
case GGML_OP_NORM:
17601769
{
1770+
// Perfetto: Norm
1771+
#include "../../include/llama_perfetto.h"
1772+
llama_perfetto_trace_begin("norm");
17611773
ggml_compute_forward_norm(params, tensor);
1774+
llama_perfetto_trace_end();
17621775
} break;
17631776
case GGML_OP_RMS_NORM:
17641777
{
1778+
// Perfetto: RMSNorm
1779+
#include "../../include/llama_perfetto.h"
1780+
llama_perfetto_trace_begin("rms_norm");
17651781
ggml_compute_forward_rms_norm(params, tensor);
1782+
llama_perfetto_trace_end();
17661783
} break;
17671784
case GGML_OP_RMS_NORM_BACK:
17681785
{
1786+
// Perfetto: RMSNorm Backward
1787+
#include "../../include/llama_perfetto.h"
1788+
llama_perfetto_trace_begin("rms_norm_back");
17691789
ggml_compute_forward_rms_norm_back(params, tensor);
1790+
llama_perfetto_trace_end();
17701791
} break;
17711792
case GGML_OP_GROUP_NORM:
17721793
{
1794+
// Perfetto: GroupNorm
1795+
#include "../../include/llama_perfetto.h"
1796+
llama_perfetto_trace_begin("group_norm");
17731797
ggml_compute_forward_group_norm(params, tensor);
1798+
llama_perfetto_trace_end();
17741799
} break;
17751800
case GGML_OP_L2_NORM:
17761801
{
17771802
ggml_compute_forward_l2_norm(params, tensor);
17781803
} break;
17791804
case GGML_OP_MUL_MAT:
17801805
{
1806+
// Perfetto: MatMul
1807+
#include "../../include/llama_perfetto.h"
1808+
llama_perfetto_trace_begin("matmul");
17811809
ggml_compute_forward_mul_mat(params, tensor);
1810+
llama_perfetto_trace_end();
17821811
} break;
17831812
case GGML_OP_MUL_MAT_ID:
17841813
{
1814+
// Perfetto: MatMul (ID)
1815+
#include "../../include/llama_perfetto.h"
1816+
llama_perfetto_trace_begin("matmul_id");
17851817
ggml_compute_forward_mul_mat_id(params, tensor);
1818+
llama_perfetto_trace_end();
17861819
} break;
17871820
case GGML_OP_OUT_PROD:
17881821
{
@@ -1846,19 +1879,35 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
18461879
} break;
18471880
case GGML_OP_SOFT_MAX:
18481881
{
1882+
// Perfetto: Softmax
1883+
#include "../../include/llama_perfetto.h"
1884+
llama_perfetto_trace_begin("softmax");
18491885
ggml_compute_forward_soft_max(params, tensor);
1886+
llama_perfetto_trace_end();
18501887
} break;
18511888
case GGML_OP_SOFT_MAX_BACK:
18521889
{
1890+
// Perfetto: Softmax Backward
1891+
#include "../../include/llama_perfetto.h"
1892+
llama_perfetto_trace_begin("softmax_back");
18531893
ggml_compute_forward_soft_max_ext_back(params, tensor);
1894+
llama_perfetto_trace_end();
18541895
} break;
18551896
case GGML_OP_ROPE:
18561897
{
1898+
// Perfetto: RoPE
1899+
#include "../../include/llama_perfetto.h"
1900+
llama_perfetto_trace_begin("rope");
18571901
ggml_compute_forward_rope(params, tensor);
1902+
llama_perfetto_trace_end();
18581903
} break;
18591904
case GGML_OP_ROPE_BACK:
18601905
{
1906+
// Perfetto: RoPE Backward
1907+
#include "../../include/llama_perfetto.h"
1908+
llama_perfetto_trace_begin("rope_back");
18611909
ggml_compute_forward_rope_back(params, tensor);
1910+
llama_perfetto_trace_end();
18621911
} break;
18631912
case GGML_OP_CLAMP:
18641913
{
@@ -1934,18 +1983,30 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
19341983
} break;
19351984
case GGML_OP_LEAKY_RELU:
19361985
{
1986+
// Perfetto: LeakyReLU
1987+
#include "../../include/llama_perfetto.h"
1988+
llama_perfetto_trace_begin("leaky_relu");
19371989
ggml_compute_forward_leaky_relu(params, tensor);
1990+
llama_perfetto_trace_end();
19381991
} break;
19391992
case GGML_OP_FLASH_ATTN_EXT:
19401993
{
1994+
// Perfetto: FlashAttention (fwd)
1995+
#include "../../include/llama_perfetto.h"
1996+
llama_perfetto_trace_begin("flash_attn");
19411997
ggml_compute_forward_flash_attn_ext(params, tensor);
1998+
llama_perfetto_trace_end();
19421999
} break;
19432000
case GGML_OP_FLASH_ATTN_BACK:
19442001
{
2002+
// Perfetto: FlashAttention (back)
2003+
#include "../../include/llama_perfetto.h"
2004+
llama_perfetto_trace_begin("flash_attn_back");
19452005
int32_t t = ggml_get_op_params_i32(tensor, 0);
19462006
GGML_ASSERT(t == 0 || t == 1);
19472007
bool masked = t != 0;
19482008
ggml_compute_forward_flash_attn_back(params, masked, tensor);
2009+
llama_perfetto_trace_end();
19492010
} break;
19502011
case GGML_OP_SSM_CONV:
19512012
{
@@ -1965,11 +2026,21 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
19652026
} break;
19662027
case GGML_OP_UNARY:
19672028
{
2029+
// Perfetto: Unary (may include relu/gelu/silu/etc.)
2030+
#include "../../include/llama_perfetto.h"
2031+
enum ggml_unary_op uop = ggml_get_unary_op(tensor);
2032+
const char * uname = ggml_unary_op_name(uop);
2033+
llama_perfetto_trace_begin(uname);
19682034
ggml_compute_forward_unary(params, tensor);
2035+
llama_perfetto_trace_end();
19692036
} break;
19702037
case GGML_OP_GLU:
19712038
{
2039+
// Perfetto: GLU
2040+
#include "../../include/llama_perfetto.h"
2041+
llama_perfetto_trace_begin("glu");
19722042
ggml_compute_forward_glu(params, tensor);
2043+
llama_perfetto_trace_end();
19732044
} break;
19742045
case GGML_OP_GET_REL_POS:
19752046
{

ggml/src/ggml-vulkan/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ if (Vulkan_FOUND)
4646

4747
ggml_add_backend_library(ggml-vulkan
4848
ggml-vulkan.cpp
49+
llama_perfetto_stubs.c
4950
../../include/ggml-vulkan.h
5051
)
5152

0 commit comments

Comments
 (0)