Skip to content
49 changes: 49 additions & 0 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1423,6 +1423,49 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
return supported;
}

// Subgroup-size settings for a single GPU, which may be known under several names.
struct GpuPipelineConfig {
    // Every alias of this GPU, e.g. "NAVI10" or "RX 5700".
    std::vector<std::string> device_names;

    // Subgroup size keyed by pipeline name, e.g. {"soft_max_f32", 64}.
    std::unordered_map<std::string, uint32_t> pipelines;

    // GPU-wide default subgroup size; stays 0 unless a value is supplied.
    uint32_t default_subgroup_size{0};
};

// Define configurations for different GPUs.
static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
{
{"NAVI10", "NAVI14", "RX 5700", "RX 5600", "RX 5500"},
{
{"soft_max_f32", 64}, {"soft_max_f32_wg512", 64},
{"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64},
{"im2col_f32", 64}, {"im2col_f32_f16", 64},
},
32
},
};

// Looks up the subgroup size configured for `pipeline_name` on the GPU named `device_name`.
//
// Configurations are scanned in table order, and a configuration applies as soon as one of
// its aliases occurs as a substring of `device_name`. For that configuration the result is
// the pipeline's own entry when it exists and is non-zero, otherwise the configuration's
// default subgroup size. Returns 0 when no configuration matches the device.
static uint32_t get_subgroup_size(const std::string &pipeline_name, const std::string &device_name) {
    for (const auto &gpu : gpu_pipeline_configs) {
        for (const auto &known_name : gpu.device_names) {
            if (device_name.find(known_name) == std::string::npos) {
                continue;  // this alias is not part of the device name
            }

            // First matching alias decides: pipeline-specific value, else the GPU default.
            const auto entry = gpu.pipelines.find(pipeline_name);
            const bool has_override = entry != gpu.pipelines.end() && entry->second != 0;
            return has_override ? entry->second : gpu.default_subgroup_size;
        }
    }

    // No configuration lists an alias contained in this device name.
    return 0;
}

static void ggml_vk_load_shaders(vk_device& device) {
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");

Expand Down Expand Up @@ -1543,11 +1586,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
}

vk::PhysicalDeviceProperties2 props2;
device->physical_device.getProperties2(&props2);
std::string device_name = props2.properties.deviceName.data();
Copy link
Collaborator

@0cc4m 0cc4m Feb 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is needed anymore (also, the device name is available in device->name, and the properties in device->properties)

Edit: I forgot it's used by the get_subgroup_size function. But just use the device field.

Copy link
Contributor Author

@daniandtheweb daniandtheweb Feb 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I don't set the device name like this the shader executes at wave64 speed instead of wave32 and I get no performance improvement at all.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked, and device->name contains just something like Vulkan0, so that's why. But if you need the actual device name, it should be stored in the device struct on init and only accessed here, something like physical_device_name.

But as mentioned in my other comment, we can probably ignore device names for now.


std::vector<std::future<void>> compiles;
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {

required_subgroup_size = get_subgroup_size(name, device_name);

if (!pipeline) {
pipeline = std::make_shared<vk_pipeline_struct>();
pipeline->name = name;
Expand Down
49 changes: 31 additions & 18 deletions ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,20 @@ void main() {
const uint batch = gl_GlobalInvocationID.z / p.IC;
const uint ic = gl_GlobalInvocationID.z % p.IC;

const uint src_base = ic * p.offset_delta + batch * p.batch_offset;
const uint dst_base = ((batch * p.OH + oh) * p.OW) * p.CHW + ic * (p.KW * p.KH);
const int oh_s1 = int(oh) * p.s1;
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);

const uint base_linear_idx = gidx * NUM_ITER;

const uint max_ky = ksize / p.OW;

uint current_kx = base_linear_idx / ksize;
const uint rem = base_linear_idx - (current_kx * ksize);
uint current_ky = rem / p.OW;
uint current_ix = rem % p.OW;

A_TYPE values[NUM_ITER];
uint offset_dst[NUM_ITER];
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
Expand All @@ -48,36 +62,35 @@ void main() {

[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {

const uint i = gidx * NUM_ITER + idx;
const uint linear_idx = base_linear_idx + idx;

const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
const uint kx = i / ksize;
const uint kd = kx * ksize;
const uint ky = (i - kd) / p.OW;
const uint ix = i % p.OW;
if (linear_idx >= p.pelements) {
continue;
}

const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
const uint iiw = current_ix * p.s0 + current_kx * p.d0 - p.p0;
const uint iih = oh_s1 + current_ky * p.d1 - p.p1;

offset_dst[idx] =
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
(ic * (p.KW * p.KH) + ky * p.KW + kx);
offset_dst[idx] = dst_base + current_ix * p.CHW + current_ky * p.KW + current_kx;

if (i >= p.pelements) {
continue;
if ((iih < p.IH) && (iiw < p.IW)) {
values[idx] = data_a[src_base + iih * p.IW + iiw];
}

if (iih < p.IH && iiw < p.IW) {
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
values[idx] = data_a[offset_src + iih * p.IW + iiw];
if (++current_ix == p.OW) {
current_ix = 0;
if (++current_ky == max_ky) {
current_ky = 0;
current_kx++;
}
}
}

[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {

const uint i = gidx * NUM_ITER + idx;
const uint linear_idx = base_linear_idx + idx;

if (i >= p.pelements) {
if (linear_idx >= p.pelements) {
continue;
}

Expand Down
Loading