@@ -3038,29 +3038,25 @@ static void ggml_vk_load_shaders(vk_device& device) {
30383038 uint32_t conv2d_WG_SIZE = 256;
30393039 uint32_t conv2d_BS_K = 128;
30403040 uint32_t conv2d_BS_CRS = 16;
3041- // Enables subgroup ops for preventing the re-calculation of indices.
3042- uint32_t use_collectives = 0;
3043- // CRS block size should be capped at sugroup size for correctness when shuffle is used.
3044- if(getenv("GGML_VK_USE_COLLECTIVES") != nullptr && device->subgroup_shuffle){
3041+ uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
3042+ if(device->subgroup_shuffle){
30453043 use_collectives = 1;
3046- conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
3044+        conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);	// CRS block size should be capped at subgroup size for correctness when shuffle is used.
30473045 }
30483046 uint32_t conv2d_BS_NPQ = 128;
30493047 uint32_t conv2d_TS_K = 8;
30503048 uint32_t conv2d_shmem_req = (conv2d_BS_K*(conv2d_BS_CRS+1) + conv2d_BS_CRS*(conv2d_BS_NPQ+1))*sizeof(float);
30513049 if(device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req){
30523050 conv2d_BS_CRS = 8;
3053- if(getenv("GGML_VK_USE_COLLECTIVES") != nullptr && device->subgroup_shuffle){
3051+ if(device->subgroup_shuffle){
30543052 conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
30553053 }
30563054 }
3057-
3058- std::cerr << " --> BS_CRS=" << conv2d_BS_CRS << " use_collectives=" << use_collectives << std::endl;
30593055
3060- if(device->subgroup_shuffle ){
3056+ if(use_collectives ){
30613057 ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {conv2d_BS_K, conv2d_BS_NPQ, 1}, {conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives}, 1, true, true);
30623058 }else{
3063- ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {conv2d_BS_K, conv2d_BS_NPQ, 1}, {conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives}, 1, true);
3059+ ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {conv2d_BS_K, conv2d_BS_NPQ, 1}, {conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives}, 1, true, false );
30643060 }
30653061
30663062 ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
@@ -10820,14 +10816,20 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
1082010816 return true;
1082110817 case GGML_OP_CONV_TRANSPOSE_1D:
1082210818 return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
10823- case GGML_OP_CONV_2D:
10824- // Channel-contiguous format is not supported yet.
10825- return (op->src[0]->type == GGML_TYPE_F32 &&
10826- op->src[1]->type == GGML_TYPE_F32 &&
10827- op->type == GGML_TYPE_F32 &&
10828- ggml_is_contiguous(op->src[0]) &&
10829- ggml_is_contiguous(op->src[1]) &&
10830- ggml_is_contiguous(op));
10819+ case GGML_OP_CONV_2D:
10820+ {
10821+ // Op is disabled for Intel
10822+ ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
10823+ const vk_device& device = ggml_vk_get_device(ctx->device);
10824+ bool is_Intel = ggml_vk_get_device(ctx->device)->vendor_id == VK_VENDOR_ID_INTEL;
10825+ // Channel-contiguous format is not supported yet.
10826+ return (op->src[0]->type == GGML_TYPE_F32 &&
10827+ op->src[1]->type == GGML_TYPE_F32 &&
10828+ op->type == GGML_TYPE_F32 &&
10829+ ggml_is_contiguous(op->src[0]) &&
10830+ ggml_is_contiguous(op->src[1]) &&
10831+ ggml_is_contiguous(op)) && !is_Intel;
10832+ }
1083110833 default:
1083210834 return false;
1083310835 }
0 commit comments