From 53b69a31be05fda1c9124be3999c4cc4e05cd018 Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Sat, 8 Mar 2025 22:00:51 +0100
Subject: [PATCH 01/10] vulkan: subgroup size test

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 53 +++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ff53bdfbe171c..c43c10c37604e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1445,6 +1445,49 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
     return supported;
 }
 
+struct GpuPipelineConfig {
+    // List of all aliases for a given GPU.
+    // For example, this can include names like "NAVI10", "RX 5700", etc.
+    std::vector<std::string> device_names;
+
+    // Mapping of pipeline names to their specific subgroup sizes.
+    // Example: {"soft_max_f32", 64}.
+    std::unordered_map<std::string, uint32_t> pipelines;
+
+    // Default subgroup size for this GPU.
+    // Defaults to 0 if not explicitly provided.
+    uint32_t default_subgroup_size = 0;
+};
+
+// Define configurations for different GPUs.
+static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
+    {
+        {"NAVI10", "NAVI14", "RX 5700", "RX 5600", "RX 5500"},
+        {
+            {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64},
+            {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64},
+            {"im2col_f32", 64}, {"im2col_f32_f16", 64},
+        },
+        32
+    },
+};
+
+static uint32_t get_subgroup_size(const std::string &pipeline_name, const std::string &device_name) {
+    for (const auto &config : gpu_pipeline_configs) {
+        for (const auto &alias : config.device_names) {
+            if (device_name.find(alias) != std::string::npos) {
+                auto pipIt = config.pipelines.find(pipeline_name);
+                if (pipIt != config.pipelines.end() && pipIt->second != 0) {
+                    return pipIt->second;
+                }
+                return config.default_subgroup_size;
+            }
+        }
+    }
+    // If no matching configuration is found, return 0.
+    return 0;
+}
+
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
 
@@ -1566,11 +1609,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
         device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
     }
 
+    vk::PhysicalDeviceProperties2 props2;
+    device->physical_device.getProperties2(&props2);
+    std::string device_name = props2.properties.deviceName.data();
+
     std::vector<std::future<void>> compiles;
     auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
                                               uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
                                               uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
 
+        required_subgroup_size = get_subgroup_size(name, device_name);
+
         if (!pipeline) {
             pipeline = std::make_shared<vk_pipeline_struct>();
             pipeline->name = name;
@@ -2779,7 +2828,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     subgroup_props.pNext = &driver_props;
     physical_device.getProperties2(&props2);
 
-    const size_t subgroup_size = subgroup_props.subgroupSize;
+    uint32_t default_subgroup_size = get_subgroup_size("", props2.properties.deviceName.data());
+    const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize;
+
     const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
 
     bool fp16_storage = false;

From 85e15e6bc832760506ac1d1033195cd3c032e127 Mon Sep 17 00:00:00 2001
From: 0cc4m <picard12@live.de>
Date: Sat, 8 Mar 2025 21:01:27 +0100
Subject: [PATCH 02/10] Vulkan: Add device architecture enum and logic to
 recognize AMD generations

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 79 ++++++++++++++++++++++++----
 1 file changed, 70 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c43c10c37604e..49bec2257d67e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -149,6 +149,66 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf);
 
 static constexpr uint32_t mul_mat_vec_max_cols = 8;
 
+enum vk_device_architecture { // Coarse GPU architecture bucket, as returned by get_device_architecture().
+    OTHER,     // Non-AMD device, or an AMD device that get_device_architecture() could not classify.
+    AMD_GCN,   // AMD device whose minimum and maximum subgroup size are both 64.
+    AMD_RDNA1, // AMD device with subgroup sizes 32..64 and wavefrontsPerSimd == 20.
+    AMD_RDNA2, // AMD device with subgroup sizes 32..64 that matched neither the RDNA1 nor the RDNA3 check.
+    AMD_RDNA3, // AMD device with subgroup sizes 32..64 and accelerated mixed-signedness packed 4x8-bit dot product.
+};
+// Classifies a physical device from its vendor ID, advertised extensions and reported properties; returns OTHER when no check matches.
+static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
+    vk::PhysicalDeviceProperties props = device.getProperties();
+
+    if (props.vendorID == VK_VENDOR_ID_AMD) {
+        const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
+        // Each flag is set when the corresponding extension name appears in ext_props.
+        bool amd_shader_core_properties = false;
+        bool integer_dot_product = false;
+        bool subgroup_size_control = false;
+
+        for (const auto& properties : ext_props) {
+            if (strcmp("VK_AMD_shader_core_properties", properties.extensionName) == 0) {
+                amd_shader_core_properties = true;
+            } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) {
+                integer_dot_product = true;
+            } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
+                subgroup_size_control = true;
+            }
+        }
+        // All three extensions are needed for the property queries below; otherwise the device stays unclassified.
+        if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control) {
+            return vk_device_architecture::OTHER;
+        }
+
+        vk::PhysicalDeviceProperties2 props2;
+        vk::PhysicalDeviceShaderCorePropertiesAMD shader_core_props_amd;
+        vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props;
+        vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
+        // Chain the extension property structs behind props2 so the single getProperties2 call below fills all of them.
+        props2.pNext = &shader_core_props_amd;
+        shader_core_props_amd.pNext = &integer_dot_props;
+        integer_dot_props.pNext = &subgroup_size_control_props;
+
+        device.getProperties2(&props2);
+        // Classify by the supported subgroup size range: 64 only -> GCN, 32..64 -> one of the RDNA generations.
+        if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 64) {
+            return vk_device_architecture::AMD_GCN;
+        }
+        if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 32) {
+            // RDNA: the wavefrontsPerSimd check (RDNA1) runs first, then the dot-product check (RDNA3); RDNA2 is the fallback.
+            if (shader_core_props_amd.wavefrontsPerSimd == 20) {
+                return vk_device_architecture::AMD_RDNA1;
+            }
+            if (integer_dot_props.integerDotProduct4x8BitPackedMixedSignednessAccelerated) {
+                return vk_device_architecture::AMD_RDNA3;
+            }
+            return vk_device_architecture::AMD_RDNA2;
+        }
+    }
+    return vk_device_architecture::OTHER;
+}
+
 struct vk_device_struct {
     std::mutex mutex;
 
@@ -161,6 +221,7 @@ struct vk_device_struct {
     bool pipeline_robustness;
     vk::Device device;
     uint32_t vendor_id;
+    vk_device_architecture architecture;
     vk_queue compute_queue;
     vk_queue transfer_queue;
     bool single_queue;
@@ -2296,7 +2357,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     device->need_compiles = false;
 }
 
-static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
+static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch);
 
 static vk_device ggml_vk_get_device(size_t idx) {
     VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
@@ -2325,6 +2386,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device->physical_device = physical_devices[dev_num];
         const std::vector<vk::ExtensionProperties> ext_props = device->physical_device.enumerateDeviceExtensionProperties();
 
+        device->architecture = get_device_architecture(device->physical_device);
+
         const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY");
         device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr;
 
@@ -2337,7 +2400,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
         bool coopmat2_support = false;
         device->coopmat_support = false;
 
-        // Check if maintenance4 is supported
         for (const auto& properties : ext_props) {
             if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
                 maintenance4_support = true;
@@ -2450,7 +2512,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
 
         device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
 
-        if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props)) {
+        if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props, device->architecture)) {
             device->coopmat_support = false;
         }
 
@@ -2856,7 +2918,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
         }
     }
 
-    if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props)) {
+    const vk_device_architecture device_architecture = get_device_architecture(physical_device);
+
+    if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture)) {
         coopmat_support = false;
     }
 
@@ -8877,7 +8941,7 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
     UNUSED(instance_extensions);
 }
 
-static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props) {
+static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
     switch (props.vendorID) {
     case VK_VENDOR_ID_INTEL:
         // Intel drivers don't support coopmat properly yet
@@ -8885,10 +8949,7 @@ static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDevicePrope
     case VK_VENDOR_ID_AMD:
         if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
             // Workaround for AMD proprietary driver reporting support on all GPUs
-            const std::string name = props.deviceName;
-            return name.rfind("AMD Radeon RX 7", 0) == 0   || name.rfind("AMD Radeon(TM) RX 7", 0) == 0   || // RDNA 3 consumer GPUs
-                   name.rfind("AMD Radeon PRO W7", 0) == 0 || name.rfind("AMD Radeon(TM) PRO W7", 0) == 0 || // RDNA 3 workstation GPUs
-                   name.rfind("AMD Radeon 7", 0) == 0      || name.rfind("AMD Radeon(TM) 7", 0) == 0;        // RDNA 3 APUs
+            return arch == vk_device_architecture::AMD_RDNA3;
         }
         return true;
     default:

From 7695541471209259fb5727ec9450db21cb217b16 Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Sat, 8 Mar 2025 22:00:58 +0100
Subject: [PATCH 03/10] vulkan: use new architecture logic to specify subgroup
 size

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 33 ++++++++++++----------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 49bec2257d67e..ffef952dd1321 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1507,12 +1507,12 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
 }
 
 struct GpuPipelineConfig {
-    // List of all aliases for a given GPU.
-    // For example, this can include names like "NAVI10", "RX 5700", etc.
-    std::vector<std::string> device_names;
+    // GPU architecture identifier.
+    // Example: vk_device_architecture::AMD_GCN
+    vk_device_architecture arch;
 
     // Mapping of pipeline names to their specific subgroup sizes.
-    // Example: {"soft_max_f32", 64}.
+    // Example: {"soft_max_f32", 64}
     std::unordered_map<std::string, uint32_t> pipelines;
 
     // Default subgroup size for this GPU.
@@ -1523,7 +1523,7 @@ struct GpuPipelineConfig {
 // Define configurations for different GPUs.
 static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
     {
-        {"NAVI10", "NAVI14", "RX 5700", "RX 5600", "RX 5500"},
+        vk_device_architecture::AMD_RDNA1,
         {
             {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64},
             {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64},
@@ -1533,16 +1533,14 @@ static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
     },
 };
 
-static uint32_t get_subgroup_size(const std::string &pipeline_name, const std::string &device_name) {
+static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) {
     for (const auto &config : gpu_pipeline_configs) {
-        for (const auto &alias : config.device_names) {
-            if (device_name.find(alias) != std::string::npos) {
-                auto pipIt = config.pipelines.find(pipeline_name);
-                if (pipIt != config.pipelines.end() && pipIt->second != 0) {
-                    return pipIt->second;
-                }
-                return config.default_subgroup_size;
+        if (config.arch == arch) {
+            auto pipIt = config.pipelines.find(pipeline_name);
+            if (pipIt != config.pipelines.end() && pipIt->second != 0) {
+                return pipIt->second;
             }
+            return config.default_subgroup_size;
         }
     }
     // If no matching configuration is found, return 0.
@@ -1670,16 +1668,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
         device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
     }
 
-    vk::PhysicalDeviceProperties2 props2;
-    device->physical_device.getProperties2(&props2);
-    std::string device_name = props2.properties.deviceName.data();
-
     std::vector<std::future<void>> compiles;
     auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
                                               uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
                                               uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
 
-        required_subgroup_size = get_subgroup_size(name, device_name);
+        required_subgroup_size = get_subgroup_size(name, device->architecture);
 
         if (!pipeline) {
             pipeline = std::make_shared<vk_pipeline_struct>();
@@ -2890,7 +2884,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     subgroup_props.pNext = &driver_props;
     physical_device.getProperties2(&props2);
 
-    uint32_t default_subgroup_size = get_subgroup_size("", props2.properties.deviceName.data());
+    vk_device_architecture arch = get_device_architecture(physical_device);
+    uint32_t default_subgroup_size = get_subgroup_size("", arch);
     const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize;
 
     const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;

From 43c3e6fd6d8177f6bd27d1b978dae7f3509762cb Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Tue, 11 Mar 2025 22:36:06 +0100
Subject: [PATCH 04/10] Initial vulkan subgroup size tuning for RDNA3

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ffef952dd1321..31072e57447d9 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1531,6 +1531,15 @@ static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
         },
         32
     },
+    {
+        vk_device_architecture::AMD_RDNA3,
+        {
+            {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64},
+            {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64},
+            {"im2col_f32", 64}, {"im2col_f32_f16", 64},
+        },
+        32
+    },
 };
 
 static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) {
@@ -1673,7 +1682,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
                                               uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
                                               uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
 
-        required_subgroup_size = get_subgroup_size(name, device->architecture);
+        if (!require_full_subgroups) {
+            required_subgroup_size = get_subgroup_size(name, device->architecture);
+        }
 
         if (!pipeline) {
             pipeline = std::make_shared<vk_pipeline_struct>();

From c41619d0b0621e4ab6fd6f1f258ab9bf8a669159 Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Wed, 12 Mar 2025 15:24:34 +0100
Subject: [PATCH 05/10] vulkan: commonize RDNA subgroup tuning

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 31072e57447d9..687eb53b22a35 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1520,25 +1520,36 @@ struct GpuPipelineConfig {
     uint32_t default_subgroup_size = 0;
 };
 
+// Common pipeline configuration for RDNA GPUs.
+static const std::unordered_map<std::string, uint32_t> rdna_pipelines = {
+    {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64},
+    {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64},
+    {"im2col_f32", 64}, {"im2col_f32_f16", 64},
+};
+static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32;
+
 // Define configurations for different GPUs.
 static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
     {
         vk_device_architecture::AMD_RDNA1,
         {
-            {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64},
-            {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64},
-            {"im2col_f32", 64}, {"im2col_f32_f16", 64},
+            rdna_pipelines,
+        },
+        RDNA_DEFAULT_SUBGROUP_SIZE
+    },
+    {
+        vk_device_architecture::AMD_RDNA2,
+        {
+            rdna_pipelines,
         },
-        32
+        RDNA_DEFAULT_SUBGROUP_SIZE
     },
     {
         vk_device_architecture::AMD_RDNA3,
         {
-            {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64},
-            {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64},
-            {"im2col_f32", 64}, {"im2col_f32_f16", 64},
+            rdna_pipelines,
         },
-        32
+        RDNA_DEFAULT_SUBGROUP_SIZE
     },
 };
 

From 1c17520f051cbed22ce7879257521ed841d57c6a Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Wed, 12 Mar 2025 17:07:57 +0100
Subject: [PATCH 06/10] vulkan: override subgroup size if
 required_subgroup_size = 0

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 687eb53b22a35..5aef01ebc08f4 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1693,7 +1693,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
                                               uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
                                               uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
 
-        if (!require_full_subgroups) {
+        if (!require_full_subgroups && required_subgroup_size == 0) {
             required_subgroup_size = get_subgroup_size(name, device->architecture);
         }
 

From afb5c2dc0d1a025f348c7f47ba6d3f771f4c24a5 Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Wed, 12 Mar 2025 17:55:11 +0100
Subject: [PATCH 07/10] vulkan: disable warp 32 for RDNA3

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 5aef01ebc08f4..92953c32902ab 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1544,13 +1544,6 @@ static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
         },
         RDNA_DEFAULT_SUBGROUP_SIZE
     },
-    {
-        vk_device_architecture::AMD_RDNA3,
-        {
-            rdna_pipelines,
-        },
-        RDNA_DEFAULT_SUBGROUP_SIZE
-    },
 };
 
 static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) {

From 29e810490cddb1b00b0b1aeb29814a19d3dd37c8 Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Thu, 13 Mar 2025 01:45:41 +0100
Subject: [PATCH 08/10] vulkan: fine tuned RDNA1 subgroup sizes

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 31 ++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 92953c32902ab..a6c13ff2164b3 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1521,11 +1521,17 @@ struct GpuPipelineConfig {
 };
 
 // Common pipeline configuration for RDNA GPUs.
-static const std::unordered_map<std::string, uint32_t> rdna_pipelines = {
-    {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64},
-    {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64},
-    {"im2col_f32", 64}, {"im2col_f32_f16", 64},
+static const std::unordered_map<std::string, uint32_t> rdna_common_pipelines = {
+    {"soft_max", 64}, {"im2col", 64},
 };
+
+// RDNA1 pipeline configuration.
+static std::unordered_map<std::string, uint32_t> rdna1_pipelines = rdna_common_pipelines;
+static const bool rdna1_initialized = (rdna1_pipelines.insert({
+    {"argmax", 64}, {"mul_mat_vec", 64},
+    {"mul_mat_vec_f16", 32}, {"mul_mat_vec_f32_f16", 32}
+}), true);
+
 static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32;
 
 // Define configurations for different GPUs.
@@ -1533,14 +1539,14 @@ static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
     {
         vk_device_architecture::AMD_RDNA1,
         {
-            rdna_pipelines,
+            rdna1_pipelines,
         },
         RDNA_DEFAULT_SUBGROUP_SIZE
     },
     {
         vk_device_architecture::AMD_RDNA2,
         {
-            rdna_pipelines,
+            rdna_common_pipelines,
         },
         RDNA_DEFAULT_SUBGROUP_SIZE
     },
@@ -1550,14 +1556,21 @@ static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_dev
     for (const auto &config : gpu_pipeline_configs) {
         if (config.arch == arch) {
             auto pipIt = config.pipelines.find(pipeline_name);
-            if (pipIt != config.pipelines.end() && pipIt->second != 0) {
+            if (pipIt != config.pipelines.end()) {
                 return pipIt->second;
             }
+            // No exact match: use the longest configured key that occurs inside the pipeline name.
+            const std::pair<const std::string, uint32_t> *best = nullptr; // single pass, no per-call copy + sort
+            for (const auto &entry : config.pipelines) {
+                if ((!best || entry.first.size() > best->first.size()) && pipeline_name.find(entry.first) != std::string::npos) {
+                    best = &entry;
+                }
+            }
+            if (best) { return best->second; }
             return config.default_subgroup_size;
         }
     }
-    // If no matching configuration is found, return 0.
-    return 0;
+    return 0; // If no matching configuration is found
 }
 
 static void ggml_vk_load_shaders(vk_device& device) {

From bf7352e1b7f0cde387c08cb014323fd9f1b95241 Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Sun, 16 Mar 2025 16:38:14 +0100
Subject: [PATCH 09/10] vulkan: adjusted subgroup size map

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index a6c13ff2164b3..2bc651230578e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1520,17 +1520,18 @@ struct GpuPipelineConfig {
     uint32_t default_subgroup_size = 0;
 };
 
-// Common pipeline configuration for RDNA GPUs.
-static const std::unordered_map<std::string, uint32_t> rdna_common_pipelines = {
+// Pipeline configuration for RDNA1 GPUs: pipeline name (or name fragment) -> subgroup size; the longest matching key wins.
+static const std::unordered_map<std::string, uint32_t> rdna1_pipelines = {
     {"soft_max", 64}, {"im2col", 64},
-};
-
-// RDNA1 pipeline configuration.
-static std::unordered_map<std::string, uint32_t> rdna1_pipelines = rdna_common_pipelines;
-static const bool rdna1_initialized = (rdna1_pipelines.insert({
     {"argmax", 64}, {"mul_mat_vec", 64},
     {"mul_mat_vec_f16", 32}, {"mul_mat_vec_f32_f16", 32}
-}), true);
+};
+
+// Pipeline configuration for RDNA2 GPUs.
+static const std::unordered_map<std::string, uint32_t> rdna2_pipelines = {
+    {"soft_max", 64}, {"im2col", 64},
+    {"argmax", 64},
+};
 
 static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32;
 
@@ -1546,7 +1547,7 @@ static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
     {
         vk_device_architecture::AMD_RDNA2,
         {
-            rdna_common_pipelines,
+            rdna2_pipelines,
         },
         RDNA_DEFAULT_SUBGROUP_SIZE
     },

From d43537ad736ee0cd5a12cc15fa88afa12a343251 Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Sun, 16 Mar 2025 16:41:28 +0100
Subject: [PATCH 10/10] vulkan: fixed RDNA2 subgroup map

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 2bc651230578e..3133977562c85 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1530,7 +1530,6 @@ static const std::unordered_map<std::string, uint32_t> rdna1_pipelines = {
 // Pipeline configuration for RDNA2 GPUs.
 static const std::unordered_map<std::string, uint32_t> rdna2_pipelines = {
     {"soft_max", 64}, {"im2col", 64},
-    {"argmax", 64},
 };
 
 static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32;