Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 32 additions & 7 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,18 @@ static constexpr uint32_t p021_max_gqa_ratio = 8;

enum vk_device_architecture {
OTHER,
AMD_GCN,
AMD_GCN12,
AMD_GCN34,
AMD_GCN5,
AMD_RDNA1,
AMD_RDNA2,
AMD_RDNA3,
};

static bool is_gcn(vk_device_architecture arch) {
return (arch == AMD_GCN12) || (arch == AMD_GCN34) || (arch == AMD_GCN5);
}

static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
vk::PhysicalDeviceProperties props = device.getProperties();

Expand All @@ -209,6 +215,7 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
bool amd_shader_core_properties = false;
bool integer_dot_product = false;
bool subgroup_size_control = false;
bool float16_int8 = false;

for (const auto& properties : ext_props) {
if (strcmp("VK_AMD_shader_core_properties", properties.extensionName) == 0) {
Expand All @@ -217,10 +224,12 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
integer_dot_product = true;
} else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
subgroup_size_control = true;
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
float16_int8 = true;
}
}

if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control) {
if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control || !float16_int8) {
return vk_device_architecture::OTHER;
}

Expand All @@ -235,8 +244,23 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&

device.getProperties2(&props2);

vk::PhysicalDeviceFeatures2 features2;
vk::PhysicalDeviceShaderFloat16Int8FeaturesKHR float16_int8_features;

features2.pNext = &float16_int8_features;

device.getFeatures2(&features2);

if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 64) {
return vk_device_architecture::AMD_GCN;
// GCN
if (shader_core_props_amd.sgprAllocationGranularity == 16) {
if (float16_int8_features.shaderFloat16) {
return vk_device_architecture::AMD_GCN5;
} else {
return vk_device_architecture::AMD_GCN34;
}
}
return vk_device_architecture::AMD_GCN12;
}
if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 32) {
// RDNA
Expand Down Expand Up @@ -1792,7 +1816,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
s_warptile_mmq_int = { subgroup_size_32, 32, 32, 32, 32, 32, 2, 2, 1, 1, subgroup_size_8 };

// chip specific tuning
if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
if (is_gcn(device->architecture) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
}

Expand Down Expand Up @@ -2357,7 +2381,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
uint32_t rm_stdq = 1;
uint32_t rm_kq = 2;
if (device->vendor_id == VK_VENDOR_ID_AMD) {
if (device->architecture == AMD_GCN) {
if (is_gcn(device->architecture)) {
rm_stdq = 2;
rm_kq = 4;
}
Expand Down Expand Up @@ -2960,7 +2984,8 @@ static vk_device ggml_vk_get_device(size_t idx) {

vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);

device->fp16 = device->fp16 && vk12_features.shaderFloat16;
// GCN 3 and 4 chips support FP16 at regular speed, but the drivers don't indicate it
device->fp16 = device->fp16 && (vk12_features.shaderFloat16 || (device->architecture == AMD_GCN34));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's against the spec to enable this if support is not indicated with the shaderFloat16 feature, which it is not for these GPUs. Did you try running this with validation layers enabled? I'm relatively sure this should throw validation issues.


device->pipeline_robustness = pl_robustness_features.pipelineRobustness;

Expand Down Expand Up @@ -3362,7 +3387,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {

vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);

fp16 = fp16 && vk12_features.shaderFloat16;
fp16 = fp16 && (vk12_features.shaderFloat16 || (device_architecture == AMD_GCN34));

uint32_t default_subgroup_size = get_subgroup_size("", device_architecture);
const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize;
Expand Down
Loading