@@ -1845,48 +1845,54 @@ static void ggml_vk_load_shaders(vk_device& device) {
18451845    }
18461846
18471847    //  mul mat vec
1848-     //  computing two rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
1848+ 
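1849+     //  AMD GCN (recognised here as an AMD device whose min and max subgroup size are both 64) and Intel graphics cards perform best when the number of rows per shader is doubled, so rm is the row multiplier applied below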
1850+     uint32_t  rm = 1 ;
1851+     if  ((device->vendor_id  == VK_VENDOR_ID_AMD && device->subgroup_min_size  == 64  && device->subgroup_max_size  == 64 ) || device->vendor_id  == VK_VENDOR_ID_INTEL)
1852+         rm = 2 ;
1853+ 
1854+     //  computing additional rows per workgroup is a benefit for Q4_0 through Q5_1, but not for Q8_0, so the base row count is 2 for the former (and IQ4_NL) and 1 for Q8_0; rm multiplies both.
18491855    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_F32 ], " mul_mat_vec_f32_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 );
18501856    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_F16 ], " mul_mat_vec_f16_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 );
1851-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q4_0], " mul_mat_vec_q4_0_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1852-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q4_1], " mul_mat_vec_q4_1_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1853-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q5_0], " mul_mat_vec_q5_0_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1854-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q5_1], " mul_mat_vec_q5_1_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1855-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q8_0], " mul_mat_vec_q8_0_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {device->subgroup_size , 1 }, 1 , true );
1857+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q4_0], " mul_mat_vec_q4_0_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
1858+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q4_1], " mul_mat_vec_q4_1_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
1859+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q5_0], " mul_mat_vec_q5_0_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
1860+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q5_1], " mul_mat_vec_q5_1_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
1861+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q8_0], " mul_mat_vec_q8_0_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 *rm , 1 , 1 }, {device->subgroup_size , 1 *rm }, 1 , true );
18561862    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q2_K], " mul_mat_vec_q2_k_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
18571863    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q3_K], " mul_mat_vec_q3_k_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
18581864    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q4_K], " mul_mat_vec_q4_k_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
18591865    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q5_K], " mul_mat_vec_q5_k_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
18601866    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_Q6_K], " mul_mat_vec_q6_k_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
1861-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_IQ4_NL], " mul_mat_vec_iq4_nl_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1867+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f32_f32 [GGML_TYPE_IQ4_NL], " mul_mat_vec_iq4_nl_f32_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
18621868
18631869    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_F32 ], " mul_mat_vec_f32_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 );
18641870    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_F16 ], " mul_mat_vec_f16_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 );
1865-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q4_0], " mul_mat_vec_q4_0_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1866-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q4_1], " mul_mat_vec_q4_1_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1867-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q5_0], " mul_mat_vec_q5_0_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1868-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q5_1], " mul_mat_vec_q5_1_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1869-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q8_0], " mul_mat_vec_q8_0_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {device->subgroup_size , 1 }, 1 , true );
1871+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q4_0], " mul_mat_vec_q4_0_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
1872+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q4_1], " mul_mat_vec_q4_1_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
1873+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q5_0], " mul_mat_vec_q5_0_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
1874+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q5_1], " mul_mat_vec_q5_1_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
1875+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q8_0], " mul_mat_vec_q8_0_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 *rm , 1 , 1 }, {device->subgroup_size , 1 *rm }, 1 , true );
18701876    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q2_K], " mul_mat_vec_q2_k_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
18711877    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q3_K], " mul_mat_vec_q3_k_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
18721878    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q4_K], " mul_mat_vec_q4_k_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
18731879    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q5_K], " mul_mat_vec_q5_k_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
18741880    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_Q6_K], " mul_mat_vec_q6_k_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
1875-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_IQ4_NL], " mul_mat_vec_iq4_nl_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 , 1 , 1 }, {device->subgroup_size }, 1 , true );
1881+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_f16_f32 [GGML_TYPE_IQ4_NL], " mul_mat_vec_iq4_nl_f16_f32" " main" 3 , sizeof (vk_mat_vec_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size ,  2 *rm }, 1 , true );
18761882
18771883    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_F32 ], " mul_mat_vec_id_f32_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 );
18781884    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_F16 ], " mul_mat_vec_id_f16_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 );
1879-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q4_0], " mul_mat_vec_id_q4_0_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1880-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q4_1], " mul_mat_vec_id_q4_1_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1881-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q5_0], " mul_mat_vec_id_q5_0_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1882-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q5_1], " mul_mat_vec_id_q5_1_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1883-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q8_0], " mul_mat_vec_id_q8_0_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {1 , 1 , 1 }, {device->subgroup_size , 1 }, 1 , true );
1885+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q4_0], " mul_mat_vec_id_q4_0_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
1886+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q4_1], " mul_mat_vec_id_q4_1_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
1887+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q5_0], " mul_mat_vec_id_q5_0_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
1888+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q5_1], " mul_mat_vec_id_q5_1_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
1889+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q8_0], " mul_mat_vec_id_q8_0_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {1 *rm , 1 , 1 }, {device->subgroup_size , 1 *rm }, 1 , true );
18841890    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q2_K], " mul_mat_vec_id_q2_k_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
18851891    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q3_K], " mul_mat_vec_id_q3_k_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
18861892    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q4_K], " mul_mat_vec_id_q4_k_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
18871893    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q5_K], " mul_mat_vec_id_q5_k_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
18881894    ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_Q6_K], " mul_mat_vec_id_q6_k_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {1 , 1 , 1 }, {subgroup_size_16}, 1 , true );
1889-     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_IQ4_NL], " mul_mat_vec_id_iq4_nl_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {2 , 1 , 1 }, {device->subgroup_size , 2 }, 1 , true );
1895+     ggml_vk_create_pipeline (device, device->pipeline_dequant_mul_mat_vec_id_f32 [GGML_TYPE_IQ4_NL], " mul_mat_vec_id_iq4_nl_f32" " main" 4 , sizeof (vk_mat_vec_id_push_constants), {2 *rm , 1 , 1 }, {device->subgroup_size , 2 *rm }, 1 , true );
18901896
18911897    //  dequant shaders
18921898    ggml_vk_create_pipeline (device, device->pipeline_dequant [GGML_TYPE_F32 ], " f32_to_f16" " main" 2 , 5  * sizeof (uint32_t ), {256  * 16 , 1 , 1 }, {}, 1 );
@@ -2243,13 +2249,16 @@ static vk_device ggml_vk_get_device(size_t idx) {
22432249
22442250        device->pipeline_robustness  = pl_robustness_features.pipelineRobustness ;
22452251
2252+         if  (device->subgroup_size_control ) {
2253+             device->subgroup_min_size  = subgroup_size_control_props.minSubgroupSize ;
2254+             device->subgroup_max_size  = subgroup_size_control_props.maxSubgroupSize ;
2255+         }
2256+ 
22462257        device->subgroup_size_control  = device->subgroup_size_control  &&
22472258                (subgroup_size_control_props.requiredSubgroupSizeStages  & vk::ShaderStageFlagBits::eCompute) &&
22482259                subgroup_size_control_features.subgroupSizeControl ;
22492260
22502261        if  (device->subgroup_size_control ) {
2251-             device->subgroup_min_size  = subgroup_size_control_props.minSubgroupSize ;
2252-             device->subgroup_max_size  = subgroup_size_control_props.maxSubgroupSize ;
22532262            device->subgroup_require_full_support  = subgroup_size_control_features.computeFullSubgroups ;
22542263            device_extensions.push_back (" VK_EXT_subgroup_size_control" 
22552264        }
0 commit comments