Skip to content

Commit 2610381

Browse files
committed
enable fp16 for gcn 3 and 4
1 parent 814f795 commit 2610381

File tree

1 file changed

+30
-7
lines changed

1 file changed

+30
-7
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -194,12 +194,21 @@ static constexpr uint32_t p021_max_gqa_ratio = 8;
194194

195195
// GPU architecture classification produced by get_device_architecture().
// Shown here in diff form: the line marked "197-" is the pre-commit enumerator,
// the lines marked "197+".."199+" are the enumerators that replace it.
// NOTE(review): the GCN5/GCN34 split relies on shaderFloat16 read from a
// *Features* struct chained into the getProperties2() pNext chain; that struct
// is normally queried through getFeatures2() - confirm the value is populated.
enum vk_device_architecture {
196196
OTHER, // returned when any extension required for detection is missing
197-
AMD_GCN, // pre-commit single GCN entry; split into the three entries below
197+
AMD_GCN12, // min == max subgroup size 64, sgprAllocationGranularity != 16 (presumably GCN 1/2 - confirm)
198+
AMD_GCN34, // sgprAllocationGranularity == 16, shaderFloat16 not reported; fp16 is force-enabled for this value
199+
AMD_GCN5, // sgprAllocationGranularity == 16, shaderFloat16 reported
198200
AMD_RDNA1, // RDNA family: detection branch is max subgroup size 64, min subgroup size 32
199201
AMD_RDNA2,
200202
AMD_RDNA3,
201203
};
202204

205+
// Reports whether `arch` is one of the AMD GCN classifications
// (AMD_GCN12, AMD_GCN34 or AMD_GCN5). Every other value yields false.
static bool is_gcn(vk_device_architecture arch) {
    switch (arch) {
        case AMD_GCN12:
        case AMD_GCN34:
        case AMD_GCN5:
            return true;
        default:
            return false;
    }
}
211+
203212
static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
204213
vk::PhysicalDeviceProperties props = device.getProperties();
205214

@@ -209,6 +218,7 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
209218
bool amd_shader_core_properties = false;
210219
bool integer_dot_product = false;
211220
bool subgroup_size_control = false;
221+
bool float16_int8 = false;
212222

213223
for (const auto& properties : ext_props) {
214224
if (strcmp("VK_AMD_shader_core_properties", properties.extensionName) == 0) {
@@ -217,26 +227,38 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
217227
integer_dot_product = true;
218228
} else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
219229
subgroup_size_control = true;
230+
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
231+
float16_int8 = true;
220232
}
221233
}
222234

223-
if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control) {
235+
if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control || !float16_int8) {
224236
return vk_device_architecture::OTHER;
225237
}
226238

227239
vk::PhysicalDeviceProperties2 props2;
228240
vk::PhysicalDeviceShaderCorePropertiesAMD shader_core_props_amd;
229241
vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props;
230242
vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
243+
vk::PhysicalDeviceShaderFloat16Int8FeaturesKHR float16_int8_props;
231244

232245
props2.pNext = &shader_core_props_amd;
233246
shader_core_props_amd.pNext = &integer_dot_props;
234247
integer_dot_props.pNext = &subgroup_size_control_props;
248+
subgroup_size_control_props.pNext = &float16_int8_props;
235249

236250
device.getProperties2(&props2);
237251

238252
if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 64) {
239-
return vk_device_architecture::AMD_GCN;
253+
// GCN
254+
if (shader_core_props_amd.sgprAllocationGranularity == 16) {
255+
if (float16_int8_props.shaderFloat16) {
256+
return vk_device_architecture::AMD_GCN5;
257+
} else {
258+
return vk_device_architecture::AMD_GCN34;
259+
}
260+
}
261+
return vk_device_architecture::AMD_GCN12;
240262
}
241263
if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 32) {
242264
// RDNA
@@ -1792,7 +1814,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
17921814
s_warptile_mmq_int = { subgroup_size_32, 32, 32, 32, 32, 32, 2, 2, 1, 1, subgroup_size_8 };
17931815

17941816
// chip specific tuning
1795-
if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
1817+
if (is_gcn(device->architecture) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
17961818
m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
17971819
}
17981820

@@ -2357,7 +2379,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
23572379
uint32_t rm_stdq = 1;
23582380
uint32_t rm_kq = 2;
23592381
if (device->vendor_id == VK_VENDOR_ID_AMD) {
2360-
if (device->architecture == AMD_GCN) {
2382+
if (is_gcn(device->architecture)) {
23612383
rm_stdq = 2;
23622384
rm_kq = 4;
23632385
}
@@ -2960,7 +2982,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
29602982

29612983
vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);
29622984

2963-
device->fp16 = device->fp16 && vk12_features.shaderFloat16;
2985+
// GCN 3 and 4 chips support FP16 at regular speed, but the drivers don't indicate it
2986+
device->fp16 = device->fp16 && (vk12_features.shaderFloat16 || (device->architecture == AMD_GCN34));
29642987

29652988
device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
29662989

@@ -3362,7 +3385,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
33623385

33633386
vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);
33643387

3365-
fp16 = fp16 && vk12_features.shaderFloat16;
3388+
fp16 = fp16 && (vk12_features.shaderFloat16 || (device_architecture == AMD_GCN34));
33663389

33673390
uint32_t default_subgroup_size = get_subgroup_size("", device_architecture);
33683391
const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize;

0 commit comments

Comments
 (0)