@@ -194,12 +194,21 @@ static constexpr uint32_t p021_max_gqa_ratio = 8;
194194
// GPU architecture buckets used to select device-specific tuning.
//
// OTHER is returned whenever a device cannot be classified as one of the AMD
// families listed here. The GCN family is split into three buckets; going by
// the enumerator names these are generations 1-2, 3-4 and 5 (TODO confirm the
// exact mapping against the detection code).
enum vk_device_architecture {
    OTHER,
    AMD_GCN12,
    AMD_GCN34,
    AMD_GCN5,
    AMD_RDNA1,
    AMD_RDNA2,
    AMD_RDNA3,
};
202204
205+ static bool is_gcn(vk_device_architecture arch) {
206+ if ((arch == AMD_GCN12) || (arch == AMD_GCN34) || (arch == AMD_GCN5))
207+ return true;
208+ else
209+ return false;
210+ }
211+
203212static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
204213 vk::PhysicalDeviceProperties props = device.getProperties();
205214
@@ -209,6 +218,7 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
209218 bool amd_shader_core_properties = false;
210219 bool integer_dot_product = false;
211220 bool subgroup_size_control = false;
221+ bool float16_int8 = false;
212222
213223 for (const auto& properties : ext_props) {
214224 if (strcmp("VK_AMD_shader_core_properties", properties.extensionName) == 0) {
@@ -217,26 +227,38 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
217227 integer_dot_product = true;
218228 } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
219229 subgroup_size_control = true;
230+ } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
231+ float16_int8 = true;
220232 }
221233 }
222234
223- if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control) {
235+ if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control || !float16_int8 ) {
224236 return vk_device_architecture::OTHER;
225237 }
226238
227239 vk::PhysicalDeviceProperties2 props2;
228240 vk::PhysicalDeviceShaderCorePropertiesAMD shader_core_props_amd;
229241 vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props;
230242 vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
243+ vk::PhysicalDeviceShaderFloat16Int8FeaturesKHR float16_int8_props;
231244
232245 props2.pNext = &shader_core_props_amd;
233246 shader_core_props_amd.pNext = &integer_dot_props;
234247 integer_dot_props.pNext = &subgroup_size_control_props;
248+ subgroup_size_control_props.pNext = &float16_int8_props;
235249
236250 device.getProperties2(&props2);
237251
238252 if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 64) {
239- return vk_device_architecture::AMD_GCN;
253+ // GCN
254+ if (shader_core_props_amd.sgprAllocationGranularity == 16) {
255+ if (float16_int8_props.shaderFloat16) {
256+ return vk_device_architecture::AMD_GCN5;
257+ } else {
258+ return vk_device_architecture::AMD_GCN34;
259+ }
260+ }
261+ return vk_device_architecture::AMD_GCN12;
240262 }
241263 if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 32) {
242264 // RDNA
@@ -1792,7 +1814,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
17921814 s_warptile_mmq_int = { subgroup_size_32, 32, 32, 32, 32, 32, 2, 2, 1, 1, subgroup_size_8 };
17931815
17941816 // chip specific tuning
1795- if ((device->architecture == AMD_GCN ) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
1817+ if (is_gcn (device->architecture) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
17961818 m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
17971819 }
17981820
@@ -2357,7 +2379,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
23572379 uint32_t rm_stdq = 1;
23582380 uint32_t rm_kq = 2;
23592381 if (device->vendor_id == VK_VENDOR_ID_AMD) {
2360- if (device->architecture == AMD_GCN ) {
2382+ if (is_gcn( device->architecture) ) {
23612383 rm_stdq = 2;
23622384 rm_kq = 4;
23632385 }
@@ -2960,7 +2982,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
29602982
29612983 vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);
29622984
2963- device->fp16 = device->fp16 && vk12_features.shaderFloat16;
2985+ // GCN 3 and 4 chips support FP16 at regular speed, but the drivers don't indicate it
2986+ device->fp16 = device->fp16 && (vk12_features.shaderFloat16 || (device->architecture == AMD_GCN34));
29642987
29652988 device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
29662989
@@ -3362,7 +3385,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
33623385
33633386 vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);
33643387
3365- fp16 = fp16 && vk12_features.shaderFloat16;
3388+ fp16 = fp16 && ( vk12_features.shaderFloat16 || (device_architecture == AMD_GCN34)) ;
33663389
33673390 uint32_t default_subgroup_size = get_subgroup_size("", device_architecture);
33683391 const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize;
0 commit comments