@@ -150,6 +150,66 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf);
150150
151151static constexpr uint32_t mul_mat_vec_max_cols = 8 ;
152152
// Coarse GPU family classification, as produced by get_device_architecture().
enum vk_device_architecture {
    OTHER     = 0,  // anything get_device_architecture() does not classify
    AMD_GCN   = 1,  // AMD, min and max subgroup size both 64
    AMD_RDNA1 = 2,  // AMD, subgroup size 32..64, wavefrontsPerSimd == 20
    AMD_RDNA2 = 3,  // AMD, subgroup size 32..64, neither of the RDNA1/RDNA3 markers
    AMD_RDNA3 = 4,  // AMD, subgroup size 32..64, accelerated mixed-sign 4x8 packed dot product
};
160+
161+ static vk_device_architecture get_device_architecture (const vk::PhysicalDevice& device) {
162+ vk::PhysicalDeviceProperties props = device.getProperties ();
163+
164+ if (props.vendorID == VK_VENDOR_ID_AMD) {
165+ const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties ();
166+
167+ bool amd_shader_core_properties = false ;
168+ bool integer_dot_product = false ;
169+ bool subgroup_size_control = false ;
170+
171+ for (const auto & properties : ext_props) {
172+ if (strcmp (" VK_AMD_shader_core_properties" , properties.extensionName ) == 0 ) {
173+ amd_shader_core_properties = true ;
174+ } else if (strcmp (" VK_KHR_shader_integer_dot_product" , properties.extensionName ) == 0 ) {
175+ integer_dot_product = true ;
176+ } else if (strcmp (" VK_EXT_subgroup_size_control" , properties.extensionName ) == 0 ) {
177+ subgroup_size_control = true ;
178+ }
179+ }
180+
181+ if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control) {
182+ return vk_device_architecture::OTHER;
183+ }
184+
185+ vk::PhysicalDeviceProperties2 props2;
186+ vk::PhysicalDeviceShaderCorePropertiesAMD shader_core_props_amd;
187+ vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props;
188+ vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
189+
190+ props2.pNext = &shader_core_props_amd;
191+ shader_core_props_amd.pNext = &integer_dot_props;
192+ integer_dot_props.pNext = &subgroup_size_control_props;
193+
194+ device.getProperties2 (&props2);
195+
196+ if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 64 ) {
197+ return vk_device_architecture::AMD_GCN;
198+ }
199+ if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 32 ) {
200+ // RDNA
201+ if (shader_core_props_amd.wavefrontsPerSimd == 20 ) {
202+ return vk_device_architecture::AMD_RDNA1;
203+ }
204+ if (integer_dot_props.integerDotProduct4x8BitPackedMixedSignednessAccelerated ) {
205+ return vk_device_architecture::AMD_RDNA3;
206+ }
207+ return vk_device_architecture::AMD_RDNA2;
208+ }
209+ }
210+ return vk_device_architecture::OTHER;
211+ }
212+
153213struct vk_device_struct {
154214 std::mutex mutex;
155215
@@ -162,6 +222,7 @@ struct vk_device_struct {
162222 bool pipeline_robustness;
163223 vk::Device device;
164224 uint32_t vendor_id;
225+ vk_device_architecture architecture;
165226 vk_queue compute_queue;
166227 vk_queue transfer_queue;
167228 bool single_queue;
@@ -1448,6 +1509,73 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
14481509 return supported;
14491510}
14501511
1512+ struct GpuPipelineConfig {
1513+ // GPU architecture identifier.
1514+ // Example: vk_device_architecture::AMD_GCN
1515+ vk_device_architecture arch;
1516+
1517+ // Mapping of pipeline names to their specific subgroup sizes.
1518+ // Example: {"soft_max_f32", 64}
1519+ std::unordered_map<std::string, uint32_t > pipelines;
1520+
1521+ // Default subgroup size for this GPU.
1522+ // Defaults to 0 if not explicitly provided.
1523+ uint32_t default_subgroup_size = 0 ;
1524+ };
1525+
// Per-pipeline subgroup sizes for RDNA1 GPUs.
// get_subgroup_size() first tries a key equal to the pipeline name, then the
// longest key contained in the pipeline name, so keys must be spelled exactly
// as they appear in pipeline names (no surrounding whitespace).
static const std::unordered_map<std::string, uint32_t> rdna1_pipelines = {
    {"soft_max", 64},
    {"im2col", 64},
    {"argmax", 64},
    {"mul_mat_vec", 64},
    {"mul_mat_vec_f16", 32},
    {"mul_mat_vec_f32_f16", 32},
};
1532+
// Per-pipeline subgroup sizes for RDNA2 GPUs.
// Keys are compared against pipeline names by get_subgroup_size() (exact match,
// then longest contained key), so they must not carry surrounding whitespace.
static const std::unordered_map<std::string, uint32_t> rdna2_pipelines = {
    {"soft_max", 64},
    {"im2col", 64},
};
1537+
// Subgroup size the RDNA entries of gpu_pipeline_configs fall back to when no
// per-pipeline override matches.
static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32;
1539+
1540+ // Define configurations for different GPUs.
1541+ static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
1542+ {
1543+ vk_device_architecture::AMD_RDNA1,
1544+ {
1545+ rdna1_pipelines,
1546+ },
1547+ RDNA_DEFAULT_SUBGROUP_SIZE
1548+ },
1549+ {
1550+ vk_device_architecture::AMD_RDNA2,
1551+ {
1552+ rdna2_pipelines,
1553+ },
1554+ RDNA_DEFAULT_SUBGROUP_SIZE
1555+ },
1556+ };
1557+
1558+ static uint32_t get_subgroup_size (const std::string &pipeline_name, const vk_device_architecture &arch) {
1559+ for (const auto &config : gpu_pipeline_configs) {
1560+ if (config.arch == arch) {
1561+ auto pipIt = config.pipelines .find (pipeline_name);
1562+ if (pipIt != config.pipelines .end ()) {
1563+ return pipIt->second ;
1564+ }
1565+ std::vector<std::pair<std::string, uint32_t >> sorted_pipelines (config.pipelines .begin (), config.pipelines .end ());
1566+ std::sort (sorted_pipelines.begin (), sorted_pipelines.end (),
1567+ [](const auto &a, const auto &b) { return a.first .size () > b.first .size (); });
1568+ for (const auto &entry : sorted_pipelines) {
1569+ if (pipeline_name.find (entry.first ) != std::string::npos) {
1570+ return entry.second ;
1571+ }
1572+ }
1573+ return config.default_subgroup_size ;
1574+ }
1575+ }
1576+ return 0 ; // If no matching configuration is found
1577+ }
1578+
14511579static void ggml_vk_load_shaders (vk_device& device) {
14521580 VK_LOG_DEBUG (" ggml_vk_load_shaders(" << device->name << " )" );
14531581
@@ -1574,6 +1702,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
15741702 uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t , 3 > wg_denoms, const std::vector<uint32_t >& specialization_constants,
15751703 uint32_t align, bool disable_robustness = false , bool require_full_subgroups = false , uint32_t required_subgroup_size = 0 ) {
15761704
1705+ if (!require_full_subgroups && required_subgroup_size == 0 ) {
1706+ required_subgroup_size = get_subgroup_size (name, device->architecture );
1707+ }
1708+
15771709 if (!pipeline) {
15781710 pipeline = std::make_shared<vk_pipeline_struct>();
15791711 pipeline->name = name;
@@ -2250,7 +2382,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
22502382 device->need_compiles = false ;
22512383}
22522384
2253- static bool ggml_vk_khr_cooperative_matrix_support (const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
2385+ static bool ggml_vk_khr_cooperative_matrix_support (const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch );
22542386
22552387static vk_device ggml_vk_get_device (size_t idx) {
22562388 VK_LOG_DEBUG (" ggml_vk_get_device(" << idx << " )" );
@@ -2279,6 +2411,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
22792411 device->physical_device = physical_devices[dev_num];
22802412 const std::vector<vk::ExtensionProperties> ext_props = device->physical_device .enumerateDeviceExtensionProperties ();
22812413
2414+ device->architecture = get_device_architecture (device->physical_device );
2415+
22822416 const char * GGML_VK_PREFER_HOST_MEMORY = getenv (" GGML_VK_PREFER_HOST_MEMORY" );
22832417 device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr ;
22842418
@@ -2291,7 +2425,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
22912425 bool coopmat2_support = false ;
22922426 device->coopmat_support = false ;
22932427
2294- // Check if maintenance4 is supported
22952428 for (const auto & properties : ext_props) {
22962429 if (strcmp (" VK_KHR_maintenance4" , properties.extensionName ) == 0 ) {
22972430 maintenance4_support = true ;
@@ -2404,7 +2537,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
24042537
24052538 device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
24062539
2407- if (!ggml_vk_khr_cooperative_matrix_support (device->properties , driver_props)) {
2540+ if (!ggml_vk_khr_cooperative_matrix_support (device->properties , driver_props, device-> architecture )) {
24082541 device->coopmat_support = false ;
24092542 }
24102543
@@ -2782,7 +2915,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
27822915 subgroup_props.pNext = &driver_props;
27832916 physical_device.getProperties2 (&props2);
27842917
2785- const size_t subgroup_size = subgroup_props.subgroupSize ;
2918+ vk_device_architecture arch = get_device_architecture (physical_device);
2919+ uint32_t default_subgroup_size = get_subgroup_size (" " , arch);
2920+ const size_t subgroup_size = (default_subgroup_size != 0 ) ? default_subgroup_size : subgroup_props.subgroupSize ;
2921+
27862922 const bool uma = props2.properties .deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
27872923
27882924 bool fp16_storage = false ;
@@ -2808,7 +2944,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
28082944 }
28092945 }
28102946
2811- if (!ggml_vk_khr_cooperative_matrix_support (props2.properties , driver_props)) {
2947+ const vk_device_architecture device_architecture = get_device_architecture (physical_device);
2948+
2949+ if (!ggml_vk_khr_cooperative_matrix_support (props2.properties , driver_props, device_architecture)) {
28122950 coopmat_support = false ;
28132951 }
28142952
@@ -8843,18 +8981,15 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
88438981 UNUSED (instance_extensions);
88448982}
88458983
8846- static bool ggml_vk_khr_cooperative_matrix_support (const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props) {
8984+ static bool ggml_vk_khr_cooperative_matrix_support (const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch ) {
88478985 switch (props.vendorID ) {
88488986 case VK_VENDOR_ID_INTEL:
88498987 // Intel drivers don't support coopmat properly yet
88508988 return false ;
88518989 case VK_VENDOR_ID_AMD:
88528990 if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
88538991 // Workaround for AMD proprietary driver reporting support on all GPUs
8854- const std::string name = props.deviceName ;
8855- return name.rfind (" AMD Radeon RX 7" , 0 ) == 0 || name.rfind (" AMD Radeon(TM) RX 7" , 0 ) == 0 || // RDNA 3 consumer GPUs
8856- name.rfind (" AMD Radeon PRO W7" , 0 ) == 0 || name.rfind (" AMD Radeon(TM) PRO W7" , 0 ) == 0 || // RDNA 3 workstation GPUs
8857- name.rfind (" AMD Radeon 7" , 0 ) == 0 || name.rfind (" AMD Radeon(TM) 7" , 0 ) == 0 ; // RDNA 3 APUs
8992+ return arch == vk_device_architecture::AMD_RDNA3;
88588993 }
88598994 return true ;
88608995 default :
0 commit comments