 #include "ggml-cpu.h"
 #endif

+// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
+
 #include <vulkan/vulkan.hpp>

+// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+
 #include <algorithm>
 #include <cmath>
 #include <iomanip>
@@ -90,6 +96,9 @@ struct vk_pipeline_struct {
     bool needed {};
     // set to true when the shader has been compiled
     bool compiled {};
+    // number of registers used, extracted from pipeline executable properties
+    uint32_t register_count {};
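+    // specialization constants the pipeline was created with, kept for occupancy estimates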
+    std::vector<uint32_t> specialization_constants;
 };

 typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
@@ -184,6 +193,8 @@ struct vk_device_struct {
     uint32_t coopmat_k;
     bool coopmat2;
 
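+    // whether VK_KHR_pipeline_executable_properties is available, for querying per-pipeline statistics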
+    bool pipeline_executable_properties_support {};
+
     size_t idx;

     bool mul_mat_l[GGML_TYPE_COUNT];
@@ -893,6 +904,20 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
     }
     pipeline->compiled = true;

+    if (device->pipeline_executable_properties_support) {
+        vk::PipelineExecutableInfoKHR executableInfo;
+        executableInfo.pipeline = pipeline->pipeline;
+
+        auto statistics = device->device.getPipelineExecutableStatisticsKHR(executableInfo);
+        for (auto& s : statistics) {
+            VK_LOG_DEBUG(pipeline->name << " " << s.name << ": " << s.value.u64);
+            // "Register Count" is reported by NVIDIA drivers.
+            if (strcmp(s.name, "Register Count") == 0) {
+                pipeline->register_count = (uint32_t)s.value.u64;
+            }
+        }
+    }
+
     {
         std::lock_guard<std::mutex> guard(device->mutex);
         device->pipelines.insert({ pipeline->name, pipeline });
@@ -1581,6 +1606,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         pipeline->push_constant_size = push_constant_size;
         pipeline->wg_denoms = wg_denoms;
         pipeline->align = align;
+        pipeline->specialization_constants = specialization_constants;
     }

     if (!pipeline->needed || pipeline->compiled) {
@@ -2289,6 +2315,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
     bool amd_shader_core_properties2 = false;
     bool pipeline_robustness = false;
     bool coopmat2_support = false;
+    bool pipeline_executable_properties_support = false;
     device->coopmat_support = false;

     // Check if maintenance4 is supported
@@ -2316,6 +2343,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
         } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
                    !getenv("GGML_VK_DISABLE_COOPMAT2")) {
             coopmat2_support = true;
+        } else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) {
+            pipeline_executable_properties_support = true;
         }
     }
 
@@ -2500,8 +2529,18 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device_extensions.push_back("VK_KHR_maintenance4");
     }

+    VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR pep_features {};
+    pep_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR;
+    if (pipeline_executable_properties_support) {
+        last_struct->pNext = (VkBaseOutStructure *)&pep_features;
+        last_struct = (VkBaseOutStructure *)&pep_features;
+        device_extensions.push_back("VK_KHR_pipeline_executable_properties");
+    }
+
     vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);

+    device->pipeline_executable_properties_support = pipeline_executable_properties_support;
+
     device->fp16 = device->fp16 && vk12_features.shaderFloat16;

     device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
@@ -2876,6 +2915,9 @@ static void ggml_vk_instance_init() {
     }
     VK_LOG_DEBUG("ggml_vk_instance_init()");

+    // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
+
     uint32_t api_version = vk::enumerateInstanceVersion();

     if (api_version < VK_API_VERSION_1_2) {
@@ -2928,6 +2970,9 @@ static void ggml_vk_instance_init() {
     vk_instance.instance = vk::createInstance(instance_create_info);
     vk_instance_initialized = true;

+    // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);
+
     size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();

     // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -3832,12 +3877,21 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int
     VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");

     uint32_t split_k = 1;
-    if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
-        // If k is 'large' and the SMs will fill less than halfway, use split_k.
+    if (ctx->device->shader_core_count != 0) {
         uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
         uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
-        if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
-            split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
+        uint32_t occupancy_factor = 1;
+        // Estimate how many workgroups can fit on an SM at a time.
+        // Other factors like shared memory could affect this, and aren't taken into account.
+        if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA && pipeline->register_count > 0) {
+            uint32_t block_size = pipeline->specialization_constants[0];
+            assert(block_size > 0);
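+            // 65536 is the assumed per-SM register file size (64K 32-bit registers on current NVIDIA GPUs).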
+            occupancy_factor = 65536 / (block_size * pipeline->register_count);
+        }
+        // The extra factor of 4 is to try to run up to 4x as many workgroups as can fit,
+        // to prefer shorter shaders that will be less prone to tail effects
+        if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count * occupancy_factor * 4) {
+            split_k = occupancy_factor * 4 * ctx->device->shader_core_count / (m_tiles * n_tiles);
             // Clamp to 2 or 4
             split_k = std::min(split_k, 4u);
             if (split_k == 3) {
@@ -4122,7 +4176,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     const int y_ne = padded_n * ne10;
     const int d_ne = ne11 * ne01;

-    const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);
+    uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);

     const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
@@ -4146,10 +4200,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
     GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT

+    const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
     if (dryrun) {
         const uint64_t x_sz_upd = x_sz * ne02 * ne03;
         const uint64_t y_sz_upd = y_sz * ne12 * ne13;
-        const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
         if (
             (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
             (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
@@ -4174,11 +4228,19 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         if (qy_needs_dequant) {
             ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
         }
-        if (split_k > 1) {
+        // ggml_vk_guess_split_k may make a different determination after the pipeline
+        // is compiled (based on register count), so prepare for split_k just in case.
+        if (split_k > 1 || !pipeline->compiled) {
             ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
         }
         return;
     }
+    // ggml_vk_guess_split_k may make a different determination after the pipeline
+    // is compiled (based on register count). Fall back to no split_k if we didn't
+    // reserve enough memory.
+    if (split_k_size > ctx->prealloc_size_split_k) {
+        split_k = 1;
+    }

     vk_buffer d_D = dst_buf_ctx->dev_buffer;
     const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;