Skip to content

Commit 5751a74

Browse files
committed
Added perfetto support for the CPU + Vulkan backends
This uses perfetto in-process profiling, and will produce a perfetto binary trace by the end of the inference. This is very useful to help visualise how the backend handles the inference. Build: cmake -S . -B build-vk -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGGML_VULKAN=ON cmake --build build-vk -j8 Run: GGML_VK_PERF_SILENT=1 GGML_VK_PERF_LOGGER=1 LLAMA_PERFETTO_TRACE=./out.pftrace build-vk/bin/llama-cli -m model.gguf Test: Tested on M4 Mac In detail this patch does the following: 1. Including the `llama_perfetto.h` header file, which contains the definitions for the Perfetto-related functions and variables used in this example. 2. Calling the `llama_perfetto_start()` function to start tracing at the beginning of the conversation. 3. Calling the `llama_perfetto_stop_flush()` function to stop tracing and flush the trace after each generation. 4. Adding a call to the `llama_perfetto_trace_begin_with_text()` function to begin an event in Perfetto with a text description of the current evaluation. 5. Adding a call to the `llama_perfetto_trace_end()` function to end the event after each evaluation. 6. Adding a call to the `llama_perfetto_counter_tokens_per_s()` function to update the Perfetto counter for tokens per second during idle periods. 7. Calling the `llama_perfetto_emit_gpu_timeline()` function to emit GPU timeline slices into Perfetto. 8. Adding a call to the `llama_perfetto_print_gpu_stats()` function to print GPU statistics at idle periods. 9. Calling the `llama_perfetto_flush_dump_stats()` function to flush and dump the Perfetto trace stats to a file at idle periods.
1 parent 3d16b29 commit 5751a74

File tree

16 files changed

+243357
-12
lines changed

16 files changed

+243357
-12
lines changed

.github/workflows/vulkan-build.yml

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
name: Vulkan Build (No Run)
2+
3+
on:
4+
push:
5+
branches: ["**"]
6+
pull_request:
7+
8+
jobs:
9+
build-vulkan:
10+
runs-on: ubuntu-latest
11+
steps:
12+
- name: Checkout repository
13+
uses: actions/checkout@v4
14+
with:
15+
submodules: recursive
16+
17+
- name: Install Vulkan build dependencies
18+
run: |
19+
sudo apt-get update
20+
# Vulkan SDK headers/libs + tools needed by GGML's Vulkan backend
21+
sudo apt-get install -y \
22+
libvulkan-dev \
23+
vulkan-validationlayers-dev \
24+
spirv-tools \
25+
libglslang-dev \
26+
glslang-tools
27+
28+
- name: Configure (CMake, Vulkan enabled)
29+
run: |
30+
cmake -S . -B build-vk -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGGML_VULKAN=ON
31+
32+
- name: Build
33+
run: |
34+
cmake --build build-vk -j8
35+
36+
- name: Verify build outputs
37+
run: |
38+
ls -la build-vk/bin || true
39+
test -e build-vk/bin/llama-cli
40+
41+
- name: Show test command (not executed)
42+
run: |
43+
echo "To run (requires Vulkan-capable GPU):"
44+
echo "GGML_VK_PERF_SILENT=1 GGML_VK_PERF_LOGGER=1 LLAMA_PERFETTO_TRACE=./<out>.pftrace build-vk/bin/llama-cli -m <model>.gguf"
45+
46+
- name: Upload build artifacts
47+
uses: actions/upload-artifact@v4
48+
with:
49+
name: build-vk-bin
50+
path: build-vk/bin/

common/common.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,9 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
348348
return true;
349349
}
350350

351+
extern "C" void llama_perfetto_try_start_from_env(void);
352+
extern "C" void llama_perfetto_stop_flush(void);
353+
351354
void common_init() {
352355
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
353356
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
@@ -362,6 +365,11 @@ void common_init() {
362365
#endif
363366

364367
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
368+
369+
// Start Perfetto in-process tracing if requested via env vars.
370+
// This enables trace generation for all tools uniformly.
371+
llama_perfetto_try_start_from_env();
372+
atexit([](){ llama_perfetto_stop_flush(); });
365373
}
366374

367375
std::string common_params_get_system_info(const common_params & params) {

ggml/include/ggml-vulkan.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(voi
2424

2525
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
2626

27+
// Utility: collect basic GPU counters via Vulkan pipeline statistics
28+
// and dump them to a text file at `path`. Returns true on success.
29+
// Currently records `VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT`.
30+
GGML_BACKEND_API bool ggml_backend_vk_dump_pipeline_stats(int device, const char * path);
31+
2732
#ifdef __cplusplus
2833
}
2934
#endif

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@
3838
#include <syscall.h>
3939
#endif
4040

41+
// Weak no-op perfetto shims to avoid link errors when ggml is linked as a shared library
42+
// and the C++ perfetto glue is not part of the ggml target.
43+
__attribute__((weak)) void llama_perfetto_trace_begin(const char * name) { (void)name; }
44+
__attribute__((weak)) void llama_perfetto_trace_end(void) { }
45+
4146
#ifdef GGML_USE_OPENMP
4247
#include <omp.h>
4348
#endif
@@ -1754,35 +1759,63 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
17541759
} break;
17551760
case GGML_OP_SILU_BACK:
17561761
{
1762+
// Perfetto: SiLU Backward
1763+
#include "../../include/llama_perfetto.h"
1764+
llama_perfetto_trace_begin("silu_back");
17571765
ggml_compute_forward_silu_back(params, tensor);
1766+
llama_perfetto_trace_end();
17581767
} break;
17591768
case GGML_OP_NORM:
17601769
{
1770+
// Perfetto: Norm
1771+
#include "../../include/llama_perfetto.h"
1772+
llama_perfetto_trace_begin("norm");
17611773
ggml_compute_forward_norm(params, tensor);
1774+
llama_perfetto_trace_end();
17621775
} break;
17631776
case GGML_OP_RMS_NORM:
17641777
{
1778+
// Perfetto: RMSNorm
1779+
#include "../../include/llama_perfetto.h"
1780+
llama_perfetto_trace_begin("rms_norm");
17651781
ggml_compute_forward_rms_norm(params, tensor);
1782+
llama_perfetto_trace_end();
17661783
} break;
17671784
case GGML_OP_RMS_NORM_BACK:
17681785
{
1786+
// Perfetto: RMSNorm Backward
1787+
#include "../../include/llama_perfetto.h"
1788+
llama_perfetto_trace_begin("rms_norm_back");
17691789
ggml_compute_forward_rms_norm_back(params, tensor);
1790+
llama_perfetto_trace_end();
17701791
} break;
17711792
case GGML_OP_GROUP_NORM:
17721793
{
1794+
// Perfetto: GroupNorm
1795+
#include "../../include/llama_perfetto.h"
1796+
llama_perfetto_trace_begin("group_norm");
17731797
ggml_compute_forward_group_norm(params, tensor);
1798+
llama_perfetto_trace_end();
17741799
} break;
17751800
case GGML_OP_L2_NORM:
17761801
{
17771802
ggml_compute_forward_l2_norm(params, tensor);
17781803
} break;
17791804
case GGML_OP_MUL_MAT:
17801805
{
1806+
// Perfetto: MatMul
1807+
#include "../../include/llama_perfetto.h"
1808+
llama_perfetto_trace_begin("matmul");
17811809
ggml_compute_forward_mul_mat(params, tensor);
1810+
llama_perfetto_trace_end();
17821811
} break;
17831812
case GGML_OP_MUL_MAT_ID:
17841813
{
1814+
// Perfetto: MatMul (ID)
1815+
#include "../../include/llama_perfetto.h"
1816+
llama_perfetto_trace_begin("matmul_id");
17851817
ggml_compute_forward_mul_mat_id(params, tensor);
1818+
llama_perfetto_trace_end();
17861819
} break;
17871820
case GGML_OP_OUT_PROD:
17881821
{
@@ -1846,19 +1879,35 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
18461879
} break;
18471880
case GGML_OP_SOFT_MAX:
18481881
{
1882+
// Perfetto: Softmax
1883+
#include "../../include/llama_perfetto.h"
1884+
llama_perfetto_trace_begin("softmax");
18491885
ggml_compute_forward_soft_max(params, tensor);
1886+
llama_perfetto_trace_end();
18501887
} break;
18511888
case GGML_OP_SOFT_MAX_BACK:
18521889
{
1890+
// Perfetto: Softmax Backward
1891+
#include "../../include/llama_perfetto.h"
1892+
llama_perfetto_trace_begin("softmax_back");
18531893
ggml_compute_forward_soft_max_ext_back(params, tensor);
1894+
llama_perfetto_trace_end();
18541895
} break;
18551896
case GGML_OP_ROPE:
18561897
{
1898+
// Perfetto: RoPE
1899+
#include "../../include/llama_perfetto.h"
1900+
llama_perfetto_trace_begin("rope");
18571901
ggml_compute_forward_rope(params, tensor);
1902+
llama_perfetto_trace_end();
18581903
} break;
18591904
case GGML_OP_ROPE_BACK:
18601905
{
1906+
// Perfetto: RoPE Backward
1907+
#include "../../include/llama_perfetto.h"
1908+
llama_perfetto_trace_begin("rope_back");
18611909
ggml_compute_forward_rope_back(params, tensor);
1910+
llama_perfetto_trace_end();
18621911
} break;
18631912
case GGML_OP_CLAMP:
18641913
{
@@ -1934,18 +1983,30 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
19341983
} break;
19351984
case GGML_OP_LEAKY_RELU:
19361985
{
1986+
// Perfetto: LeakyReLU
1987+
#include "../../include/llama_perfetto.h"
1988+
llama_perfetto_trace_begin("leaky_relu");
19371989
ggml_compute_forward_leaky_relu(params, tensor);
1990+
llama_perfetto_trace_end();
19381991
} break;
19391992
case GGML_OP_FLASH_ATTN_EXT:
19401993
{
1994+
// Perfetto: FlashAttention (fwd)
1995+
#include "../../include/llama_perfetto.h"
1996+
llama_perfetto_trace_begin("flash_attn");
19411997
ggml_compute_forward_flash_attn_ext(params, tensor);
1998+
llama_perfetto_trace_end();
19421999
} break;
19432000
case GGML_OP_FLASH_ATTN_BACK:
19442001
{
2002+
// Perfetto: FlashAttention (back)
2003+
#include "../../include/llama_perfetto.h"
2004+
llama_perfetto_trace_begin("flash_attn_back");
19452005
int32_t t = ggml_get_op_params_i32(tensor, 0);
19462006
GGML_ASSERT(t == 0 || t == 1);
19472007
bool masked = t != 0;
19482008
ggml_compute_forward_flash_attn_back(params, masked, tensor);
2009+
llama_perfetto_trace_end();
19492010
} break;
19502011
case GGML_OP_SSM_CONV:
19512012
{
@@ -1965,11 +2026,21 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
19652026
} break;
19662027
case GGML_OP_UNARY:
19672028
{
2029+
// Perfetto: Unary (may include relu/gelu/silu/etc.)
2030+
#include "../../include/llama_perfetto.h"
2031+
enum ggml_unary_op uop = ggml_get_unary_op(tensor);
2032+
const char * uname = ggml_unary_op_name(uop);
2033+
llama_perfetto_trace_begin(uname);
19682034
ggml_compute_forward_unary(params, tensor);
2035+
llama_perfetto_trace_end();
19692036
} break;
19702037
case GGML_OP_GLU:
19712038
{
2039+
// Perfetto: GLU
2040+
#include "../../include/llama_perfetto.h"
2041+
llama_perfetto_trace_begin("glu");
19722042
ggml_compute_forward_glu(params, tensor);
2043+
llama_perfetto_trace_end();
19732044
} break;
19742045
case GGML_OP_GET_REL_POS:
19752046
{

ggml/src/ggml-vulkan/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ if (Vulkan_FOUND)
4646

4747
ggml_add_backend_library(ggml-vulkan
4848
ggml-vulkan.cpp
49+
llama_perfetto_stubs.c
4950
../../include/ggml-vulkan.h
5051
)
5152

0 commit comments

Comments
 (0)