 #include "ggml-cpu.h"
 #endif

+// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
+
 #include <vulkan/vulkan.hpp>

+// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+
 #include <algorithm>
 #include <cmath>
 #include <iomanip>
@@ -90,6 +96,9 @@ struct vk_pipeline_struct {
     bool needed {};
     // set to true when the shader has been compiled
     bool compiled {};
+    // number of registers used, extracted from pipeline executable properties
+    uint32_t register_count {};
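+    // specialization constants the pipeline was created with, kept for occupancy estimates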
+    std::vector<uint32_t> specialization_constants;
 };

 typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
@@ -184,6 +193,8 @@ struct vk_device_struct {
     uint32_t coopmat_k;
     bool coopmat2;
 
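+    // whether VK_KHR_pipeline_executable_properties is available, for querying per-pipeline statistics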
+    bool pipeline_executable_properties_support {};
+
     size_t idx;

     bool mul_mat_l[GGML_TYPE_COUNT];
@@ -893,6 +904,20 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
     }
     pipeline->compiled = true;

+    if (device->pipeline_executable_properties_support) {
+        vk::PipelineExecutableInfoKHR executableInfo;
+        executableInfo.pipeline = pipeline->pipeline;
+
+        auto statistics = device->device.getPipelineExecutableStatisticsKHR(executableInfo);
+        for (auto& s : statistics) {
+            VK_LOG_DEBUG(pipeline->name << " " << s.name << ": " << s.value.u64);
+            // "Register Count" is reported by NVIDIA drivers.
+            if (strcmp(s.name, "Register Count") == 0) {
+                pipeline->register_count = (uint32_t)s.value.u64;
+            }
+        }
+    }
+
     {
         std::lock_guard<std::mutex> guard(device->mutex);
         device->pipelines.insert({ pipeline->name, pipeline });
@@ -1581,6 +1606,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         pipeline->push_constant_size = push_constant_size;
         pipeline->wg_denoms = wg_denoms;
         pipeline->align = align;
+        pipeline->specialization_constants = specialization_constants;
     }

     if (!pipeline->needed || pipeline->compiled) {
@@ -2289,6 +2315,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
     bool amd_shader_core_properties2 = false;
     bool pipeline_robustness = false;
     bool coopmat2_support = false;
+    bool pipeline_executable_properties_support = false;
     device->coopmat_support = false;

     // Check if maintenance4 is supported
@@ -2316,6 +2343,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
         } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
                    !getenv("GGML_VK_DISABLE_COOPMAT2")) {
             coopmat2_support = true;
+        } else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) {
+            pipeline_executable_properties_support = true;
         }
     }
 
@@ -2500,8 +2529,18 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device_extensions.push_back("VK_KHR_maintenance4");
     }

+    VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR pep_features {};
+    pep_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR;
+    if (pipeline_executable_properties_support) {
+        last_struct->pNext = (VkBaseOutStructure *)&pep_features;
+        last_struct = (VkBaseOutStructure *)&pep_features;
+        device_extensions.push_back("VK_KHR_pipeline_executable_properties");
+    }
+
     vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);

+    device->pipeline_executable_properties_support = pipeline_executable_properties_support;
+
     device->fp16 = device->fp16 && vk12_features.shaderFloat16;

     device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
@@ -2876,6 +2915,9 @@ static void ggml_vk_instance_init() {
     }
     VK_LOG_DEBUG("ggml_vk_instance_init()");

+    // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
+
     uint32_t api_version = vk::enumerateInstanceVersion();

     if (api_version < VK_API_VERSION_1_2) {
@@ -2928,6 +2970,9 @@ static void ggml_vk_instance_init() {
     vk_instance.instance = vk::createInstance(instance_create_info);
     vk_instance_initialized = true;

+    // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);
+
     size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();

     // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -3832,12 +3877,21 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int
     VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");

     uint32_t split_k = 1;
-    if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
-        // If k is 'large' and the SMs will fill less than halfway, use split_k.
+    if (ctx->device->shader_core_count != 0) {
         uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
         uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
-        if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
-            split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
+        uint32_t occupancy_factor = 1;
+        // Estimate how many workgroups can fit on an SM at a time.
+        // Other factors like shared memory could affect this, and aren't taken into account.
+        if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA && pipeline->register_count > 0) {
+            uint32_t block_size = pipeline->specialization_constants[0];
+            assert(block_size > 0);
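+            // 65536 is the assumed per-SM register file size (64K 32-bit registers on current NVIDIA GPUs).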
+            occupancy_factor = 65536 / (block_size * pipeline->register_count);
+        }
+        // The extra factor of 4 is to try to run up to 4x as many workgroups as can fit,
+        // to prefer shorter shaders that will be less prone to tail effects
+        if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count * occupancy_factor * 4) {
+            split_k = occupancy_factor * 4 * ctx->device->shader_core_count / (m_tiles * n_tiles);
             // Clamp to 2 or 4
             split_k = std::min(split_k, 4u);
             if (split_k == 3) {
@@ -4122,7 +4176,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     const int y_ne = padded_n * ne10;
     const int d_ne = ne11 * ne01;

-    const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);
+    uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);

     const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
@@ -4146,10 +4200,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
     GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT

+    const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
     if (dryrun) {
         const uint64_t x_sz_upd = x_sz * ne02 * ne03;
         const uint64_t y_sz_upd = y_sz * ne12 * ne13;
-        const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
         if (
             (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
             (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
@@ -4174,11 +4228,19 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         if (qy_needs_dequant) {
             ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
         }
-        if (split_k > 1) {
+        // ggml_vk_guess_split_k may make a different determination after the pipeline
+        // is compiled (based on register count), so prepare for split_k just in case.
+        if (split_k > 1 || !pipeline->compiled) {
             ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
         }
         return;
     }
+    // ggml_vk_guess_split_k may make a different determination after the pipeline
+    // is compiled (based on register count). Fall back to no split_k if we didn't
+    // reserve enough memory.
+    if (split_k_size > ctx->prealloc_size_split_k) {
+        split_k = 1;
+    }

     vk_buffer d_D = dst_buf_ctx->dev_buffer;
     const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;