@@ -222,6 +222,7 @@ enum vk_device_architecture {
     AMD_RDNA2,
     AMD_RDNA3,
     INTEL_XE2,
+    NVIDIA_PRE_TURING,
 };

 // HSK x HSV
@@ -315,10 +316,33 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
             // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
             return vk_device_architecture::INTEL_XE2;
         }
+    } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
+        const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
+
+        bool cooperative_matrix = false;
+
+        // Detect "pre-turing" based on lack of coopmat support.
+        for (const auto& properties : ext_props) {
+            if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) {
+                cooperative_matrix = true;
+                break;
+            }
+        }
+
+        if (!cooperative_matrix) {
+            return vk_device_architecture::NVIDIA_PRE_TURING;
+        }
     }
     return vk_device_architecture::OTHER;
 }

+enum vk_conv_shapes {
+    CONV_SHAPE_128x128,
+    CONV_SHAPE_64x32,
+    CONV_SHAPE_32x256,
+    CONV_SHAPE_COUNT,
+};
+
 struct vk_device_struct {
     std::recursive_mutex mutex;

@@ -483,8 +507,8 @@ struct vk_device_struct {
     vk_pipeline pipeline_rwkv_wkv6_f32;
     vk_pipeline pipeline_rwkv_wkv7_f32;
     vk_pipeline pipeline_opt_step_adamw_f32;
-    vk_pipeline pipeline_conv2d_f32;
-    vk_pipeline pipeline_conv2d_f16_f32;
+    vk_pipeline pipeline_conv2d_f32[CONV_SHAPE_COUNT];
+    vk_pipeline pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT];
     vk_pipeline pipeline_conv2d_dw_whcn_f32;
     vk_pipeline pipeline_conv2d_dw_cwhn_f32;

@@ -908,8 +932,22 @@ struct vk_op_conv2d_push_constants {
     uint32_t nb1;
     uint32_t nb2;
     uint32_t nb3;
+
+    // init_fastdiv_values constants for dividing by KW, KW*KH, OW, OW*OH
+    uint32_t KWmp; uint32_t KWL;
+    uint32_t KWKHmp; uint32_t KWKHL;
+    uint32_t OWmp; uint32_t OWL;
+    uint32_t OWOHmp; uint32_t OWOHL;
 };

+template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
+    // Compute magic values to divide by KW, KW*KH, OW, OW*OH
+    init_fastdiv_values(p.KW, p.KWmp, p.KWL);
+    init_fastdiv_values(p.KW*p.KH, p.KWKHmp, p.KWKHL);
+    init_fastdiv_values(p.OW, p.OWmp, p.OWL);
+    init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
+}
+
 struct vk_op_conv2d_dw_push_constants {
     uint32_t ne;
     uint32_t batches;
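
For context, and not part of the diff itself: the new KWmp/KWL-style pairs are magic-multiplier/shift constants that let the conv2d shader replace integer division by the runtime-constant values KW, KW*KH, OW and OW*OH with a multiply-high, an add and a shift. The sketch below shows the standard Granlund-Montgomery style scheme that init_fastdiv_values is assumed to follow; the exact rounding in ggml-vulkan.cpp and the shader-side helper name are assumptions, not quoted from the file.

    #include <cassert>
    #include <cstdint>

    // Host side: pick L = ceil(log2(d)) and a magic multiplier mp so that
    // n / d == (mulhi32(n, mp) + n) >> L for the index ranges used by the shader
    // (the 32-bit add must not overflow, which holds for n < d * 2^(32 - L)).
    static void init_fastdiv_values(uint32_t d, uint32_t & mp, uint32_t & L) {
        L = 0;
        while (L < 32 && (uint32_t{1} << L) < d) {
            L++;
        }
        mp = (uint32_t)(((uint64_t{1} << 32) * ((uint64_t{1} << L) - d)) / d + 1);
    }

    // Shader-side equivalent, written in C++ for illustration (GLSL would use umulExtended).
    static uint32_t fastdiv(uint32_t n, uint32_t mp, uint32_t L) {
        uint32_t hi = (uint32_t)(((uint64_t)n * mp) >> 32); // high 32 bits of n * mp
        return (hi + n) >> L;
    }

    int main() {
        uint32_t mp, L;
        init_fastdiv_values(3 * 3, mp, L); // e.g. divide by KW*KH for a 3x3 kernel
        for (uint32_t n = 0; n < 1000000; ++n) {
            assert(fastdiv(n, mp, L) == n / 9);
        }
        return 0;
    }
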
@@ -3048,48 +3086,89 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

     // conv2d
-    uint32_t conv2d_WG_SIZE = 256;
-    uint32_t conv2d_BS_K = 128;
-    uint32_t conv2d_BS_CRS = 16;
-    uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
-    if (device->subgroup_shuffle &&
-        device->vendor_id != VK_VENDOR_ID_INTEL) { // Do not enable collectives on Intel, see PR 14316
-        use_collectives = 1;
-        conv2d_BS_CRS = std::min(
-            device->subgroup_size,
-            conv2d_BS_CRS); // CRS block size should be capped at sugroup size for correctness when shuffle is used.
-    }
-    uint32_t conv2d_BS_NPQ = 128;
-    uint32_t conv2d_TS_K = 8;
-    uint32_t conv2d_shmem_req =
-        (conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float);
-    if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
-        conv2d_BS_CRS = 8;
-        if (use_collectives) {
-            conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
-        }
-    }
-
-    if (use_collectives) {
-        ggml_vk_create_pipeline(
-            device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
-            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
-            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
-        ggml_vk_create_pipeline(
-            device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
-            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
-            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
-    } else {
-        ggml_vk_create_pipeline(
-            device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
-            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
-            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
-            false);
-        ggml_vk_create_pipeline(
-            device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
-            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
-            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
-            false);
+    for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
+        uint32_t conv2d_WG_SIZE = 256;
+        uint32_t conv2d_BS_K = 128;
+        uint32_t conv2d_BS_CRS = 16;
+        uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
+        uint32_t conv2d_BS_NPQ = 128;
+        uint32_t conv2d_TS_K = 8;
+        uint32_t conv2d_SHMEM_PAD = 4;
+        bool conv2d_UNROLL = true;
+
+        if (device->vendor_id == VK_VENDOR_ID_INTEL) {
+            conv2d_SHMEM_PAD = 0;
+            conv2d_UNROLL = false;
+        } else if (device->vendor_id == VK_VENDOR_ID_AMD) {
+            conv2d_SHMEM_PAD = device->architecture == vk_device_architecture::AMD_GCN ? 1 : 4;
+        }
+
+        switch (s) {
+        default:
+        case CONV_SHAPE_128x128:
+            conv2d_BS_K = 128;
+            conv2d_BS_NPQ = 128;
+            conv2d_BS_CRS = 16;
+            if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != vk_device_architecture::AMD_GCN) {
+                conv2d_UNROLL = false;
+            }
+            break;
+        case CONV_SHAPE_64x32:
+            conv2d_BS_K = 64;
+            conv2d_BS_NPQ = 32;
+            conv2d_BS_CRS = 32;
+            conv2d_TS_K = 4;
+            break;
+        case CONV_SHAPE_32x256:
+            conv2d_BS_K = 32;
+            conv2d_BS_NPQ = 256;
+            conv2d_BS_CRS = 16;
+            break;
+        }
+
+        // Use collectives on pre-Turing NVIDIA GPUs and GCN AMD cards, which had slower integer math.
+        bool allow_collectives_nv = device->vendor_id != VK_VENDOR_ID_NVIDIA ||
+                                    device->architecture == vk_device_architecture::NVIDIA_PRE_TURING;
+        bool allow_collectives_amd = device->vendor_id != VK_VENDOR_ID_AMD ||
+                                     device->architecture == vk_device_architecture::AMD_GCN;
+
+        if (device->subgroup_shuffle &&
+            device->vendor_id != VK_VENDOR_ID_INTEL && // Do not enable collectives on Intel, see PR 14316.
+            allow_collectives_nv &&
+            allow_collectives_amd) {
+            use_collectives = 1;
+            conv2d_BS_CRS = std::min(
+                device->subgroup_size,
+                conv2d_BS_CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used.
+        }
+
+        uint32_t conv2d_shmem_req =
+            (conv2d_BS_K * (conv2d_BS_CRS + conv2d_SHMEM_PAD) + conv2d_BS_CRS * (conv2d_BS_NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
+        if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
+            conv2d_BS_CRS = 8;
+            if (use_collectives) {
+                conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
+            }
+        }
+
+        std::array<uint32_t, 3> wg_denoms = { conv2d_BS_K, conv2d_BS_NPQ, 1 };
+        std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
+
+        if (conv2d_UNROLL) {
+            ggml_vk_create_pipeline(
+                device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_unroll_len, conv2d_f32_unroll_data, "main", 3,
+                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+            ggml_vk_create_pipeline(
+                device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_unroll_len, conv2d_f16_f32_unroll_data, "main", 3,
+                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+        } else {
+            ggml_vk_create_pipeline(
+                device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
+                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+            ggml_vk_create_pipeline(
+                device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
+                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+        }
     }

     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
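
For context, not part of the diff: with the default conv2d_SHMEM_PAD of 4, the shared-memory requirement (before any reduction of conv2d_BS_CRS to the subgroup size) works out to (128*(16+4) + 16*(128+4))*4 = 18688 bytes for the 128x128 tile, (64*(32+4) + 32*(32+4))*4 = 13824 bytes for the 64x32 tile, and (32*(16+4) + 16*(256+4))*4 = 19200 bytes for the 32x256 tile. Desktop GPUs typically advertise 32 KiB or more of compute shared memory, so the conv2d_BS_CRS = 8 fallback should only trigger on devices near the Vulkan-mandated minimum of 16 KiB for maxComputeSharedMemorySize.
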
@@ -6641,6 +6720,34 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     }
 }

+static std::array<uint32_t, 3> ggml_vk_get_conv_elements(const ggml_tensor *dst) {
+    const ggml_tensor *src0 = dst->src[0];
+    const ggml_tensor *src1 = dst->src[1];
+
+    // src0 - kernel: [KW, KH, Cin, Cout]
+    // src1 - input: [W, H, Cin, N]
+    // dst - result: [OW, OH, Cout, N]
+
+    // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
+    auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
+        return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
+    };
+    // parallelize in {OW/BS_K, OH/BS_NPQ, 1}
+    int64_t W = src1->ne[0];
+    int64_t H = src1->ne[1];
+    int64_t KW = src0->ne[0];
+    int64_t KH = src0->ne[1];
+    int64_t Cout = src0->ne[3];
+    int64_t N = src1->ne[3];
+    int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
+    int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
+    int64_t NPQ = N * OW * OH;
+
+    // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
+    std::array<uint32_t, 3> elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
+    return elements;
+}
+
 static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
     switch (op) {
     case GGML_OP_GET_ROWS:
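
As a worked example of the formula above (illustrative numbers, not taken from the patch): a 3x3 kernel with stride 1, padding 1 and dilation 1 applied to a 64x64 input with N = 2 and Cout = 32 gives OW = (64 + 2*1 - 1*(3-1) - 1)/1 + 1 = 64 and OH = 64, so NPQ = 2*64*64 = 8192 and the dispatch size is elements = { 32, 8192, 1 }.
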
@@ -6970,10 +7077,30 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     case GGML_OP_CONV_2D:
         if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
             ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
+            auto elements = ggml_vk_get_conv_elements(dst);
+            vk_conv_shapes shape;
+
+            uint32_t tiles[CONV_SHAPE_COUNT];
+            for (uint32_t i = 0; i < CONV_SHAPE_COUNT; ++i) {
+                tiles[i] = CEIL_DIV(elements[0], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[0]) * CEIL_DIV(elements[1], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[1]);
+            }
+
+            // We can't query number of shader cores on Intel, use 32 as a placeholder
+            // so small convolutions will still choose a smaller tile.
+            const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
+
+            if (elements[0] > 64 && tiles[CONV_SHAPE_128x128] >= shader_core_count * 2) {
+                shape = CONV_SHAPE_128x128;
+            } else if (elements[0] <= 32 && tiles[CONV_SHAPE_32x256] >= shader_core_count * 2) {
+                shape = CONV_SHAPE_32x256;
+            } else {
+                shape = CONV_SHAPE_64x32;
+            }
+
             if (src0->type == GGML_TYPE_F32) {
-                return ctx->device->pipeline_conv2d_f32;
+                return ctx->device->pipeline_conv2d_f32[shape];
             } else if (src0->type == GGML_TYPE_F16) {
-                return ctx->device->pipeline_conv2d_f16_f32;
+                return ctx->device->pipeline_conv2d_f16_f32[shape];
             }
         }
         return nullptr;
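
Continuing the illustrative numbers from the note above (elements = { 32, 8192, 1 }): with wg_denoms of {128, 128}, {64, 32} and {32, 256}, the candidate tile counts are CEIL_DIV(32, 128) * CEIL_DIV(8192, 128) = 64, CEIL_DIV(32, 64) * CEIL_DIV(8192, 32) = 256 and CEIL_DIV(32, 32) * CEIL_DIV(8192, 256) = 32. A small GPU reporting 16 shader cores satisfies elements[0] <= 32 and 32 >= 2*16, so CONV_SHAPE_32x256 is selected; on a device with 32 or more shader cores neither threshold is met and the default CONV_SHAPE_64x32 is used. A wider convolution, say Cout = 256 over the same input (tiles[CONV_SHAPE_128x128] = 2*64 = 128), would pick CONV_SHAPE_128x128 whenever 128 >= 2 * shader_core_count.
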
@@ -7301,29 +7428,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         } break;
     case GGML_OP_CONV_2D:
         {
-            // src0 - kernel: [KW, KH, Cin, Cout]
-            // src1 - input: [W, H, Cin, N]
-            // dst - result: [OW, OH, Cout, N]
-
-            // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
-            auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
-                return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
-            };
-            // parallelize in {OW/BS_K, OH/BS_NPQ, 1}
-            int64_t W = src1->ne[0];
-            int64_t H = src1->ne[1];
-            int64_t KW = src0->ne[0];
-            int64_t KH = src0->ne[1];
-            int64_t Cout = src0->ne[3];
-            int64_t N = src1->ne[3];
-            int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
-            int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
-            int64_t NPQ = N * OW * OH;
-
-            // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
-            elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
-        }
-        break;
+            elements = ggml_vk_get_conv_elements(dst);
+        } break;
     case GGML_OP_ADD:
     case GGML_OP_SUB:
     case GGML_OP_DIV: