From 53b69a31be05fda1c9124be3999c4cc4e05cd018 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Sat, 8 Mar 2025 22:00:51 +0100 Subject: [PATCH 01/10] vulkan: subgroup size test --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 53 +++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ff53bdfbe171c..c43c10c37604e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1445,6 +1445,49 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec return supported; } +struct GpuPipelineConfig { + // List of all aliases for a given GPU. + // For example, this can include names like "NAVI10", "RX 5700", etc. + std::vector device_names; + + // Mapping of pipeline names to their specific subgroup sizes. + // Example: {"soft_max_f32", 64}. + std::unordered_map pipelines; + + // Default subgroup size for this GPU. + // Defaults to 0 if not explicitly provided. + uint32_t default_subgroup_size = 0; +}; + +// Define configurations for different GPUs. +static std::vector gpu_pipeline_configs = { + { + {"NAVI10", "NAVI14", "RX 5700", "RX 5600", "RX 5500"}, + { + {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64}, + {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64}, + {"im2col_f32", 64}, {"im2col_f32_f16", 64}, + }, + 32 + }, +}; + +static uint32_t get_subgroup_size(const std::string &pipeline_name, const std::string &device_name) { + for (const auto &config : gpu_pipeline_configs) { + for (const auto &alias : config.device_names) { + if (device_name.find(alias) != std::string::npos) { + auto pipIt = config.pipelines.find(pipeline_name); + if (pipIt != config.pipelines.end() && pipIt->second != 0) { + return pipIt->second; + } + return config.default_subgroup_size; + } + } + } + // If no matching configuration is found, return 0. + return 0; +} + static void ggml_vk_load_shaders(vk_device& device) { VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); @@ -1566,11 +1609,17 @@ static void ggml_vk_load_shaders(vk_device& device) { device->pipeline_matmul_id_f32 = std::make_shared(); } + vk::PhysicalDeviceProperties2 props2; + device->physical_device.getProperties2(&props2); + std::string device_name = props2.properties.deviceName.data(); + std::vector> compiles; auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) { + required_subgroup_size = get_subgroup_size(name, device_name); + if (!pipeline) { pipeline = std::make_shared(); pipeline->name = name; @@ -2779,7 +2828,9 @@ static void ggml_vk_print_gpu_info(size_t idx) { subgroup_props.pNext = &driver_props; physical_device.getProperties2(&props2); - const size_t subgroup_size = subgroup_props.subgroupSize; + uint32_t default_subgroup_size = get_subgroup_size("", props2.properties.deviceName.data()); + const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize; + const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; bool fp16_storage = false; From 85e15e6bc832760506ac1d1033195cd3c032e127 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 8 Mar 2025 21:01:27 +0100 Subject: [PATCH 02/10] Vulkan: Add device architecture enum and logic to recognize AMD generations --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 79 ++++++++++++++++++++++++---- 1 file changed, 70 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c43c10c37604e..49bec2257d67e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -149,6 +149,66 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf); static constexpr uint32_t mul_mat_vec_max_cols = 8; +enum vk_device_architecture { + OTHER, + AMD_GCN, + AMD_RDNA1, + AMD_RDNA2, + AMD_RDNA3, +}; + +static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) { + vk::PhysicalDeviceProperties props = device.getProperties(); + + if (props.vendorID == VK_VENDOR_ID_AMD) { + const std::vector ext_props = device.enumerateDeviceExtensionProperties(); + + bool amd_shader_core_properties = false; + bool integer_dot_product = false; + bool subgroup_size_control = false; + + for (const auto& properties : ext_props) { + if (strcmp("VK_AMD_shader_core_properties", properties.extensionName) == 0) { + amd_shader_core_properties = true; + } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) { + integer_dot_product = true; + } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { + subgroup_size_control = true; + } + } + + if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control) { + return vk_device_architecture::OTHER; + } + + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceShaderCorePropertiesAMD shader_core_props_amd; + vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props; + vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; + + props2.pNext = &shader_core_props_amd; + shader_core_props_amd.pNext = &integer_dot_props; + integer_dot_props.pNext = &subgroup_size_control_props; + + device.getProperties2(&props2); + + if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 64) { + return vk_device_architecture::AMD_GCN; + } + if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 32) { + // RDNA + if (shader_core_props_amd.wavefrontsPerSimd == 20) { + return vk_device_architecture::AMD_RDNA1; + } + if (integer_dot_props.integerDotProduct4x8BitPackedMixedSignednessAccelerated) { + return vk_device_architecture::AMD_RDNA3; + } + return vk_device_architecture::AMD_RDNA2; + } + } + return vk_device_architecture::OTHER; +} + struct vk_device_struct { std::mutex mutex; @@ -161,6 +221,7 @@ struct vk_device_struct { bool pipeline_robustness; vk::Device device; uint32_t vendor_id; + vk_device_architecture architecture; vk_queue compute_queue; vk_queue transfer_queue; bool single_queue; @@ -2296,7 +2357,7 @@ static void ggml_vk_load_shaders(vk_device& device) { device->need_compiles = false; } -static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props); +static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch); static vk_device ggml_vk_get_device(size_t idx) { VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")"); @@ -2325,6 +2386,8 @@ static vk_device ggml_vk_get_device(size_t idx) { device->physical_device = physical_devices[dev_num]; const std::vector ext_props = device->physical_device.enumerateDeviceExtensionProperties(); + device->architecture = get_device_architecture(device->physical_device); + const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY"); device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr; @@ -2337,7 +2400,6 @@ static vk_device ggml_vk_get_device(size_t idx) { bool coopmat2_support = false; device->coopmat_support = false; - // Check if maintenance4 is supported for (const auto& properties : ext_props) { if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { maintenance4_support = true; @@ -2450,7 +2512,7 @@ static vk_device ggml_vk_get_device(size_t idx) { device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; - if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props)) { + if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props, device->architecture)) { device->coopmat_support = false; } @@ -2856,7 +2918,9 @@ static void ggml_vk_print_gpu_info(size_t idx) { } } - if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props)) { + const vk_device_architecture device_architecture = get_device_architecture(physical_device); + + if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture)) { coopmat_support = false; } @@ -8877,7 +8941,7 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve UNUSED(instance_extensions); } -static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props) { +static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { switch (props.vendorID) { case VK_VENDOR_ID_INTEL: // Intel drivers don't support coopmat properly yet @@ -8885,10 +8949,7 @@ static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDevicePrope case VK_VENDOR_ID_AMD: if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) { // Workaround for AMD proprietary driver reporting support on all GPUs - const std::string name = props.deviceName; - return name.rfind("AMD Radeon RX 7", 0) == 0 || name.rfind("AMD Radeon(TM) RX 7", 0) == 0 || // RDNA 3 consumer GPUs - name.rfind("AMD Radeon PRO W7", 0) == 0 || name.rfind("AMD Radeon(TM) PRO W7", 0) == 0 || // RDNA 3 workstation GPUs - name.rfind("AMD Radeon 7", 0) == 0 || name.rfind("AMD Radeon(TM) 7", 0) == 0; // RDNA 3 APUs + return arch == vk_device_architecture::AMD_RDNA3; } return true; default: From 7695541471209259fb5727ec9450db21cb217b16 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Sat, 8 Mar 2025 22:00:58 +0100 Subject: [PATCH 03/10] vulkan: use new architecture logic to specify subgroup size --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 33 ++++++++++++---------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 49bec2257d67e..ffef952dd1321 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1507,12 +1507,12 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec } struct GpuPipelineConfig { - // List of all aliases for a given GPU. - // For example, this can include names like "NAVI10", "RX 5700", etc. - std::vector device_names; + // GPU architecture identifier. + // Example: vk_device_architecture::AMD_GCN + vk_device_architecture arch; // Mapping of pipeline names to their specific subgroup sizes. - // Example: {"soft_max_f32", 64}. + // Example: {"soft_max_f32", 64} std::unordered_map pipelines; // Default subgroup size for this GPU. @@ -1523,7 +1523,7 @@ struct GpuPipelineConfig { // Define configurations for different GPUs. static std::vector gpu_pipeline_configs = { { - {"NAVI10", "NAVI14", "RX 5700", "RX 5600", "RX 5500"}, + vk_device_architecture::AMD_RDNA1, { {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64}, {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64}, @@ -1533,16 +1533,14 @@ static std::vector gpu_pipeline_configs = { }, }; -static uint32_t get_subgroup_size(const std::string &pipeline_name, const std::string &device_name) { +static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) { for (const auto &config : gpu_pipeline_configs) { - for (const auto &alias : config.device_names) { - if (device_name.find(alias) != std::string::npos) { - auto pipIt = config.pipelines.find(pipeline_name); - if (pipIt != config.pipelines.end() && pipIt->second != 0) { - return pipIt->second; - } - return config.default_subgroup_size; + if (config.arch == arch) { + auto pipIt = config.pipelines.find(pipeline_name); + if (pipIt != config.pipelines.end() && pipIt->second != 0) { + return pipIt->second; } + return config.default_subgroup_size; } } // If no matching configuration is found, return 0. @@ -1670,16 +1668,12 @@ static void ggml_vk_load_shaders(vk_device& device) { device->pipeline_matmul_id_f32 = std::make_shared(); } - vk::PhysicalDeviceProperties2 props2; - device->physical_device.getProperties2(&props2); - std::string device_name = props2.properties.deviceName.data(); - std::vector> compiles; auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) { - required_subgroup_size = get_subgroup_size(name, device_name); + required_subgroup_size = get_subgroup_size(name, device->architecture); if (!pipeline) { pipeline = std::make_shared(); @@ -2890,7 +2884,8 @@ static void ggml_vk_print_gpu_info(size_t idx) { subgroup_props.pNext = &driver_props; physical_device.getProperties2(&props2); - uint32_t default_subgroup_size = get_subgroup_size("", props2.properties.deviceName.data()); + vk_device_architecture arch = get_device_architecture(physical_device); + uint32_t default_subgroup_size = get_subgroup_size("", arch); const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize; const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; From 43c3e6fd6d8177f6bd27d1b978dae7f3509762cb Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Tue, 11 Mar 2025 22:36:06 +0100 Subject: [PATCH 04/10] Initial vulkan subgroup size tuning for RDNA3 --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ffef952dd1321..31072e57447d9 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1531,6 +1531,15 @@ static std::vector gpu_pipeline_configs = { }, 32 }, + { + vk_device_architecture::AMD_RDNA3, + { + {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64}, + {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64}, + {"im2col_f32", 64}, {"im2col_f32_f16", 64}, + }, + 32 + }, }; static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) { @@ -1673,7 +1682,9 @@ static void ggml_vk_load_shaders(vk_device& device) { uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) { - required_subgroup_size = get_subgroup_size(name, device->architecture); + if (!require_full_subgroups) { + required_subgroup_size = get_subgroup_size(name, device->architecture); + } if (!pipeline) { pipeline = std::make_shared(); From c41619d0b0621e4ab6fd6f1f258ab9bf8a669159 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Wed, 12 Mar 2025 15:24:34 +0100 Subject: [PATCH 05/10] vulkan: commonize RDNA subgroup tuning --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 31072e57447d9..687eb53b22a35 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1520,25 +1520,36 @@ struct GpuPipelineConfig { uint32_t default_subgroup_size = 0; }; +// Common pipeline configuration for RDNA GPUs. +static const std::unordered_map rdna_pipelines = { + {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64}, + {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64}, + {"im2col_f32", 64}, {"im2col_f32_f16", 64}, +}; +static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32; + // Define configurations for different GPUs. static std::vector gpu_pipeline_configs = { { vk_device_architecture::AMD_RDNA1, { - {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64}, - {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64}, - {"im2col_f32", 64}, {"im2col_f32_f16", 64}, + rdna_pipelines, + }, + RDNA_DEFAULT_SUBGROUP_SIZE + }, + { + vk_device_architecture::AMD_RDNA2, + { + rdna_pipelines, }, - 32 + RDNA_DEFAULT_SUBGROUP_SIZE }, { vk_device_architecture::AMD_RDNA3, { - {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64}, - {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64}, - {"im2col_f32", 64}, {"im2col_f32_f16", 64}, + rdna_pipelines, }, - 32 + RDNA_DEFAULT_SUBGROUP_SIZE }, }; From 1c17520f051cbed22ce7879257521ed841d57c6a Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Wed, 12 Mar 2025 17:07:57 +0100 Subject: [PATCH 06/10] vulkan: override subgroup size if required_subgroup_size = 0 --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 687eb53b22a35..5aef01ebc08f4 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1693,7 +1693,7 @@ static void ggml_vk_load_shaders(vk_device& device) { uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) { - if (!require_full_subgroups) { + if (!require_full_subgroups && required_subgroup_size == 0) { required_subgroup_size = get_subgroup_size(name, device->architecture); } From afb5c2dc0d1a025f348c7f47ba6d3f771f4c24a5 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Wed, 12 Mar 2025 17:55:11 +0100 Subject: [PATCH 07/10] vulkan: disable warp 32 for RDNA3 --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 5aef01ebc08f4..92953c32902ab 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1544,13 +1544,6 @@ static std::vector gpu_pipeline_configs = { }, RDNA_DEFAULT_SUBGROUP_SIZE }, - { - vk_device_architecture::AMD_RDNA3, - { - rdna_pipelines, - }, - RDNA_DEFAULT_SUBGROUP_SIZE - }, }; static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) { From 29e810490cddb1b00b0b1aeb29814a19d3dd37c8 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Thu, 13 Mar 2025 01:45:41 +0100 Subject: [PATCH 08/10] vulkan: fine tuned RDNA1 subgroup sizes --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 31 ++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 92953c32902ab..a6c13ff2164b3 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1521,11 +1521,17 @@ struct GpuPipelineConfig { }; // Common pipeline configuration for RDNA GPUs. -static const std::unordered_map rdna_pipelines = { - {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64}, - {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64}, - {"im2col_f32", 64}, {"im2col_f32_f16", 64}, +static const std::unordered_map rdna_common_pipelines = { + {"soft_max", 64}, {"im2col", 64}, }; + +// RDNA1 pipeline configuration. +static std::unordered_map rdna1_pipelines = rdna_common_pipelines; +static const bool rdna1_initialized = (rdna1_pipelines.insert({ + {"argmax", 64}, {"mul_mat_vec", 64}, + {"mul_mat_vec_f16", 32}, {"mul_mat_vec_f32_f16", 32} +}), true); + static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32; // Define configurations for different GPUs. @@ -1533,14 +1539,14 @@ static std::vector gpu_pipeline_configs = { { vk_device_architecture::AMD_RDNA1, { - rdna_pipelines, + rdna1_pipelines, }, RDNA_DEFAULT_SUBGROUP_SIZE }, { vk_device_architecture::AMD_RDNA2, { - rdna_pipelines, + rdna_common_pipelines, }, RDNA_DEFAULT_SUBGROUP_SIZE }, @@ -1550,14 +1556,21 @@ static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_dev for (const auto &config : gpu_pipeline_configs) { if (config.arch == arch) { auto pipIt = config.pipelines.find(pipeline_name); - if (pipIt != config.pipelines.end() && pipIt->second != 0) { + if (pipIt != config.pipelines.end()) { return pipIt->second; } + std::vector> sorted_pipelines(config.pipelines.begin(), config.pipelines.end()); + std::sort(sorted_pipelines.begin(), sorted_pipelines.end(), + [](const auto &a, const auto &b) { return a.first.size() > b.first.size(); }); + for (const auto &entry : sorted_pipelines) { + if (pipeline_name.find(entry.first) != std::string::npos) { + return entry.second; + } + } return config.default_subgroup_size; } } - // If no matching configuration is found, return 0. - return 0; + return 0; // If no matching configuration is found } static void ggml_vk_load_shaders(vk_device& device) { From bf7352e1b7f0cde387c08cb014323fd9f1b95241 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Sun, 16 Mar 2025 16:38:14 +0100 Subject: [PATCH 09/10] vulkan: adjusted subgroup size map --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index a6c13ff2164b3..2bc651230578e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1520,17 +1520,18 @@ struct GpuPipelineConfig { uint32_t default_subgroup_size = 0; }; -// Common pipeline configuration for RDNA GPUs. -static const std::unordered_map rdna_common_pipelines = { +// Pipeline configuration for RDNA1 GPUs. +static const std::unordered_map rdna1_pipelines = { {"soft_max", 64}, {"im2col", 64}, -}; - -// RDNA1 pipeline configuration. -static std::unordered_map rdna1_pipelines = rdna_common_pipelines; -static const bool rdna1_initialized = (rdna1_pipelines.insert({ {"argmax", 64}, {"mul_mat_vec", 64}, {"mul_mat_vec_f16", 32}, {"mul_mat_vec_f32_f16", 32} -}), true); +}; + +// Pipeline configuration for RDNA2 GPUs. +static const std::unordered_map rdna2_pipelines = { + {"soft_max", 64}, {"im2col", 64}, + {"argmax", 64}, +}; static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32; @@ -1546,7 +1547,7 @@ static std::vector gpu_pipeline_configs = { { vk_device_architecture::AMD_RDNA2, { - rdna_common_pipelines, + rdna2_pipelines, }, RDNA_DEFAULT_SUBGROUP_SIZE }, From d43537ad736ee0cd5a12cc15fa88afa12a343251 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Sun, 16 Mar 2025 16:41:28 +0100 Subject: [PATCH 10/10] vulkan: fixed RDNA2 subgroup map --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 2bc651230578e..3133977562c85 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1530,7 +1530,6 @@ static const std::unordered_map rdna1_pipelines = { // Pipeline configuration for RDNA2 GPUs. static const std::unordered_map rdna2_pipelines = { {"soft_max", 64}, {"im2col", 64}, - {"argmax", 64}, }; static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32;