39 changes: 39 additions & 0 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1423,6 +1423,36 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
return supported;
}

// Define a configuration map per GPU.
// Outer key: GPU identifier (e.g. "RX 5700").
// Inner map: key is pipeline name; value is the subgroup size.
static std::unordered_map<std::string, std::unordered_map<std::string, uint32_t>> gpu_pipeline_config = {
{"RX 5700", {
{"soft_max_f32", 64}, {"soft_max_f32_wg512", 64},
{"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64},
{"im2col_f32", 64}, {"im2col_f32_f16", 64},
}}
};

static uint32_t get_subgroup_size(const std::string &pipeline_name, const std::string &device_name) {
std::string foundKey;
for (const auto &entry : gpu_pipeline_config) {
if (device_name.find(entry.first) != std::string::npos) {
foundKey = entry.first;
break;
}
}
if (!foundKey.empty()) {
auto &pipelineMap = gpu_pipeline_config[foundKey];
auto pipIt = pipelineMap.find(pipeline_name);
if (pipIt != pipelineMap.end() && pipIt->second != 0) {
return pipIt->second;
}
}
// If not defined, return 0.
return 0;
}

static void ggml_vk_load_shaders(vk_device& device) {
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");

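To make the lookup behaviour concrete, here is a condensed standalone sketch of the same table and helper (the table contents and the substring match mirror the hunk above; the `main` driver and its device strings are purely illustrative):

```cpp
// Standalone illustration of the per-GPU subgroup-size lookup above.
// The table contents mirror the patch; main() is only a demonstration.
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

static const std::unordered_map<std::string, std::unordered_map<std::string, uint32_t>> gpu_pipeline_config = {
    {"RX 5700", {
        {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64},
        {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64},
        {"im2col_f32", 64}, {"im2col_f32_f16", 64},
    }},
};

// Returns the pinned subgroup size for (pipeline, device), or 0 if none is configured.
static uint32_t get_subgroup_size(const std::string & pipeline_name, const std::string & device_name) {
    for (const auto & entry : gpu_pipeline_config) {
        // Substring match, so "AMD Radeon RX 5700 XT" also hits the "RX 5700" entry.
        if (device_name.find(entry.first) == std::string::npos) {
            continue;
        }
        auto it = entry.second.find(pipeline_name);
        if (it != entry.second.end() && it->second != 0) {
            return it->second;
        }
    }
    return 0; // no override configured: the caller decides what to do with 0
}

int main() {
    std::cout << get_subgroup_size("soft_max_f32", "AMD Radeon RX 5700 XT")    << "\n"; // 64
    std::cout << get_subgroup_size("im2col_f32",   "AMD Radeon RX 5700")       << "\n"; // 64
    std::cout << get_subgroup_size("mul_mat_f32",  "AMD Radeon RX 5700")       << "\n"; // 0 (no entry for this pipeline)
    std::cout << get_subgroup_size("soft_max_f32", "NVIDIA GeForce RTX 3060")  << "\n"; // 0 (GPU not in the table)
}
```

Matching by substring means any device whose reported name contains "RX 5700" (for example the XT variant) picks up the same per-pipeline overrides.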
@@ -1543,11 +1573,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
}

vk::PhysicalDeviceProperties2 props2;
device->physical_device.getProperties2(&props2);
std::string device_name = props2.properties.deviceName.data();

@0cc4m (Collaborator) commented on Feb 21, 2025:

I don't think this is needed anymore (also, the device name is available in device->name, and the properties in device->properties).

Edit: I forgot it's used by the get_subgroup_size function. But just use the device field.

@daniandtheweb (Contributor, Author) commented on Feb 22, 2025:

If I don't set the device name like this, the shader executes at wave64 speed instead of wave32 and I get no performance improvement at all.

@0cc4m (Collaborator):

I checked and device->name contains just something like Vulkan0, so that's why. But if you need the actual device name, it should be stored in the device struct on init and only accessed here, something like physical_device_name.

But as mentioned in my other comment, we can probably ignore device names for now.
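A rough sketch of what that suggestion could look like, assuming a hypothetical physical_device_name field (struct and function names below are illustrative, not the actual ggml-vulkan definitions):

```cpp
#include <string>
#include <vulkan/vulkan.hpp>

// Illustrative only: cache the driver-reported device name once at init time,
// so ggml_vk_load_shaders can read it from the device struct instead of
// re-querying the physical device properties there.
struct vk_device_sketch {
    vk::PhysicalDevice physical_device;
    std::string        name;                  // backend label, e.g. "Vulkan0"
    std::string        physical_device_name;  // marketing name, e.g. "AMD Radeon RX 5700 XT"
};

static void vk_device_init_name(vk_device_sketch & device) {
    vk::PhysicalDeviceProperties2 props2;
    device.physical_device.getProperties2(&props2);
    device.physical_device_name = props2.properties.deviceName.data();
}
```

The subgroup-size lookup would then read device->physical_device_name instead of a string queried locally in ggml_vk_load_shaders.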


std::vector<std::future<void>> compiles;
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {

required_subgroup_size = get_subgroup_size(name, device_name);
if (required_subgroup_size == 0) {
required_subgroup_size = (device_name.find("RX 5700") != std::string::npos) ? 32 : required_subgroup_size;
}

if (!pipeline) {
pipeline = std::make_shared<vk_pipeline_struct>();
pipeline->name = name;
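Taken together, the override at the top of the ggml_vk_create_pipeline lambda resolves the effective subgroup size as restated below. This helper is a standalone restatement for clarity, not code from the patch; it uses the get_subgroup_size helper introduced in the first hunk.

```cpp
#include <cstdint>
#include <string>

// Standalone restatement of the selection done inside the lambda. Note that the
// table lookup overwrites whatever value the caller passed for required_subgroup_size.
static uint32_t resolve_required_subgroup_size(const std::string & pipeline_name,
                                               const std::string & device_name) {
    // Per-pipeline override from gpu_pipeline_config (0 if nothing matches).
    uint32_t size = get_subgroup_size(pipeline_name, device_name);
    if (size == 0 && device_name.find("RX 5700") != std::string::npos) {
        // Device-wide fallback for the RX 5700: request wave32 for all other pipelines.
        size = 32;
    }
    return size; // 0 means no explicit requirement, so the driver keeps its default
}
```

Because the assignment on the first line of the lambda replaces the incoming parameter, a caller-supplied required_subgroup_size is effectively ignored; with the current table contents, devices other than the RX 5700 always end up with 0.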
50 changes: 32 additions & 18 deletions ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
@@ -40,6 +40,20 @@ void main() {
const uint batch = gl_GlobalInvocationID.z / p.IC;
const uint ic = gl_GlobalInvocationID.z % p.IC;

+ const uint src_base = ic * p.offset_delta + batch * p.batch_offset;
+ const uint dst_base = ((batch * p.OH + oh) * p.OW) * p.CHW + ic * (p.KW * p.KH);
+ const int oh_s1 = int(oh) * p.s1;
+ const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
+
+ const uint base_linear_idx = gidx * NUM_ITER;
+
+ const uint max_ky = ksize / p.OW;
+
+ uint current_kx = base_linear_idx / ksize;
+ const uint rem = base_linear_idx - (current_kx * ksize);
+ uint current_ky = rem / p.OW;
+ uint current_ix = rem % p.OW;
+
A_TYPE values[NUM_ITER];
uint offset_dst[NUM_ITER];
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
@@ -48,36 +62,36 @@

[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {

- const uint i = gidx * NUM_ITER + idx;
+ const uint linear_idx = base_linear_idx + idx;

- const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
- const uint kx = i / ksize;
- const uint kd = kx * ksize;
- const uint ky = (i - kd) / p.OW;
- const uint ix = i % p.OW;
+ if (linear_idx >= p.pelements) {
+ continue;
+ }

- const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
- const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
+ const int iiw = int(current_ix) * p.s0 + int(current_kx) * p.d0 - p.p0;
+ const int iih = oh_s1 + int(current_ky) * p.d1 - p.p1;

- offset_dst[idx] =
- ((batch * p.OH + oh) * p.OW + ix) * p.CHW +
- (ic * (p.KW * p.KH) + ky * p.KW + kx);
+ offset_dst[idx] = dst_base + current_ix * p.CHW + current_ky * p.KW + current_kx;

- if (i >= p.pelements) {
- continue;
+ const bool valid = (iih >= 0 && iih < int(p.IH)) && (iiw >= 0 && iiw < int(p.IW));
+ if (valid) {
+ values[idx] = data_a[src_base + uint(iih) * p.IW + uint(iiw)];
}

- if (iih < p.IH && iiw < p.IW) {
- const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
- values[idx] = data_a[offset_src + iih * p.IW + iiw];
+ if (++current_ix == p.OW) {
+ current_ix = 0;
+ if (++current_ky == max_ky) {
+ current_ky = 0;
+ current_kx++;
+ }
}
}

[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {

- const uint i = gidx * NUM_ITER + idx;
+ const uint linear_idx = base_linear_idx + idx;

- if (i >= p.pelements) {
+ if (linear_idx >= p.pelements) {
continue;
}

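The rework hoists the loop-invariant source and destination bases (src_base, dst_base, oh_s1, ksize) out of the unrolled loop and replaces the per-element division/modulo decomposition of the linear index with three counters that are decoded once and then stepped incrementally. The sketch below is a plain C++ reference of just that index bookkeeping (names simplified, not the shader itself); the assert checks the incremental scheme against the old per-element decomposition:

```cpp
#include <cassert>
#include <cstdint>

// CPU reference for the incremental index scheme used by the reworked loop:
// decode the first linear index into (kx, ky, ix) once, then advance the three
// counters per element instead of recomputing a division and two modulos.
struct im2col_indexer {
    uint32_t OW;
    uint32_t ksize;   // p.OW * (p.KH > 1 ? p.KW : 1), as in the shader
    uint32_t max_ky;  // ksize / p.OW
    uint32_t kx, ky, ix;

    im2col_indexer(uint32_t ow, uint32_t kw, uint32_t kh, uint32_t first_linear_idx)
        : OW(ow), ksize(ow * (kh > 1 ? kw : 1)), max_ky(ksize / ow) {
        kx = first_linear_idx / ksize;
        const uint32_t rem = first_linear_idx - kx * ksize;
        ky = rem / OW;
        ix = rem % OW;
    }

    // Mirrors the ++current_ix / ++current_ky / current_kx++ chain at the end of the loop body.
    void advance() {
        if (++ix == OW) {
            ix = 0;
            if (++ky == max_ky) {
                ky = 0;
                kx++;
            }
        }
    }
};

int main() {
    const uint32_t OW = 7, KW = 3, KH = 3;          // arbitrary small example
    const uint32_t ksize = OW * (KH > 1 ? KW : 1);
    im2col_indexer it(OW, KW, KH, /*first_linear_idx=*/0);
    for (uint32_t i = 0; i < OW * KW * KH; ++i) {
        // Old shader decomposition, recomputed per element.
        const uint32_t kx = i / ksize;
        const uint32_t ky = (i - kx * ksize) / OW;
        const uint32_t ix = i % OW;
        assert(it.kx == kx && it.ky == ky && it.ix == ix);
        it.advance();
    }
    return 0;
}
```

The bounds check also moves to signed iih/iiw with an explicit >= 0 test, so padded positions that fall outside the input are skipped without relying on unsigned wrap-around.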