@@ -1231,6 +1231,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
12311231
12321232    std::cerr << " ggml_vulkan: Compiling shaders"  ;
12331233
1234+     //  some shaders require the subgroup size to be 16 or larger
1235+     const  uint32_t  subgroup_size_16 = std::max (device->subgroup_size , 16u );
1236+ 
12341237    //  mulmat
12351238    std::vector<uint32_t > l_warptile, m_warptile, s_warptile,
12361239                          l_warptile_mmq, m_warptile_mmq, s_warptile_mmq;
@@ -1240,11 +1243,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
12401243
12411244    l_warptile = { 128 , 128 , 128 , 16 , device->subgroup_size  * 2 , 64 , 2 , 4 , 4 , device->subgroup_size  };
12421245    m_warptile = { 128 ,  64 ,  64 , 16 , device->subgroup_size , 32 , 2 , 4 , 2 , device->subgroup_size  };
1243-     s_warptile = { std::max (device-> subgroup_size ,  16u ) ,  32 ,  32 , 16 , 32 , 32 , 2 , 2 , 2 , device->subgroup_size  };
1246+     s_warptile = { subgroup_size_16 ,  32 ,  32 , 16 , 32 , 32 , 2 , 2 , 2 , device->subgroup_size  };
12441247
12451248    l_warptile_mmq = { 128 , 128 , 128 , 32 , device->subgroup_size  * 2 , 64 , 2 , 4 , 4 , device->subgroup_size  };
12461249    m_warptile_mmq = { 128 ,  64 ,  64 , 32 , device->subgroup_size , 32 , 2 , 4 , 2 , device->subgroup_size  };
1247-     s_warptile_mmq = { std::max (device-> subgroup_size ,  16u ) ,  32 ,  32 , 32 , 32 , 32 , 2 , 2 , 2 , device->subgroup_size  };
1250+     s_warptile_mmq = { subgroup_size_16 ,  32 ,  32 , 32 , 32 , 32 , 2 , 2 , 2 , device->subgroup_size  };
12481251
12491252    l_mmq_wg_denoms = l_wg_denoms = {128 , 128 , 1  };
12501253    m_mmq_wg_denoms = m_wg_denoms = { 64 ,  64 , 1  };
@@ -1431,7 +1434,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
14311434    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q3_K], " mul_mat_vec_q3_k_f32_f32"  , mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, " main"  , 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {device->subgroup_size }, 1 , true );
14321435    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q4_K], " mul_mat_vec_q4_k_f32_f32"  , mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, " main"  , 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {device->subgroup_size }, 1 , true );
14331436    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q5_K], " mul_mat_vec_q5_k_f32_f32"  , mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, " main"  , 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {device->subgroup_size }, 1 , true );
1434-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q6_K], " mul_mat_vec_q6_k_f32_f32"  , mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, " main"  , 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {device-> subgroup_size }, 1 , true );
1437+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q6_K], " mul_mat_vec_q6_k_f32_f32"  , mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, " main"  , 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {subgroup_size_16 }, 1 , true );
14351438    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_IQ4_NL], " mul_mat_vec_iq4_nl_f32_f32"  , mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, " main"  , 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
14361439
14371440    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_F32 ], " mul_mat_vec_f32_f16_f32"  ,  mul_mat_vec_f32_f16_f32_len,  mul_mat_vec_f32_f16_f32_data,  " main"  , 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 );
@@ -1445,7 +1448,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
14451448    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q3_K], " mul_mat_vec_q3_k_f16_f32"  , mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, " main"  , 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {device->subgroup_size }, 1 , true );
14461449    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q4_K], " mul_mat_vec_q4_k_f16_f32"  , mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, " main"  , 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {device->subgroup_size }, 1 , true );
14471450    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q5_K], " mul_mat_vec_q5_k_f16_f32"  , mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, " main"  , 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {device->subgroup_size }, 1 , true );
1448-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q6_K], " mul_mat_vec_q6_k_f16_f32"  , mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, " main"  , 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {device-> subgroup_size }, 1 , true );
1451+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q6_K], " mul_mat_vec_q6_k_f16_f32"  , mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, " main"  , 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {subgroup_size_16 }, 1 , true );
14491452    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_IQ4_NL], " mul_mat_vec_iq4_nl_f16_f32"  , mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, " main"  , 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size }, 1 , true );
14501453
14511454    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_F32 ], " mul_mat_vec_id_f32_f32"  ,  mul_mat_vec_id_f32_f32_len,  mul_mat_vec_id_f32_f32_data,  " main"  , 4 , sizeof (vk_mat_vec_id_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 );
@@ -1459,7 +1462,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
14591462    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q3_K], " mul_mat_vec_id_q3_k_f32"  , mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, " main"  , 4 , sizeof (vk_mat_vec_id_push_constants), {1 , 1 , 1 }, {device->subgroup_size }, 1 , true );
14601463    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q4_K], " mul_mat_vec_id_q4_k_f32"  , mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, " main"  , 4 , sizeof (vk_mat_vec_id_push_constants), {1 , 1 , 1 }, {device->subgroup_size }, 1 , true );
14611464    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q5_K], " mul_mat_vec_id_q5_k_f32"  , mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, " main"  , 4 , sizeof (vk_mat_vec_id_push_constants), {1 , 1 , 1 }, {device->subgroup_size }, 1 , true );
1462-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q6_K], " mul_mat_vec_id_q6_k_f32"  , mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, " main"  , 4 , sizeof (vk_mat_vec_id_push_constants), {1 , 1 , 1 }, {device-> subgroup_size }, 1 , true );
1465+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q6_K], " mul_mat_vec_id_q6_k_f32"  , mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, " main"  , 4 , sizeof (vk_mat_vec_id_push_constants), {1 , 1 , 1 }, {subgroup_size_16 }, 1 , true );
14631466    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_IQ4_NL], " mul_mat_vec_id_iq4_nl_f32"  , mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, " main"  , 4 , sizeof (vk_mat_vec_id_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
14641467
14651468    //  dequant shaders
0 commit comments