Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 14 additions & 12 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3603,26 +3603,25 @@ static void ggml_vk_load_shaders(vk_device& device) {
device, device->pipeline_##name##type_suffix[s], #name #type_suffix, \
name##type_suffix##spv_suffix##_len, name##type_suffix##spv_suffix##_data, "main", 3, \
sizeof(vk_op_##name##_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
// Instantiate CREATE_CONV for every convolution pipeline variant that shares
// the given SPIR-V suffix: conv2d (f32 and f16_f32) unconditionally, and
// conv_transpose_2d (f32 and f16_f32) only when the device's
// maxPushConstantsSize limit is at least
// sizeof(vk_op_conv_transpose_2d_push_constants). When that check fails, this
// macro does not create the conv_transpose_2d pipelines.
#define CREATE_CONVS(spv_suffix) \
CREATE_CONV(conv2d, _f32, spv_suffix) \
CREATE_CONV(conv2d, _f16_f32, spv_suffix) \
if (device->properties.limits.maxPushConstantsSize >= sizeof(vk_op_conv_transpose_2d_push_constants)) { \
CREATE_CONV(conv_transpose_2d, _f32, spv_suffix) \
CREATE_CONV(conv_transpose_2d, _f16_f32, spv_suffix) \
}
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
if (device->coopmat2) {
CREATE_CONV(conv2d, _f32, _cm2)
CREATE_CONV(conv2d, _f16_f32, _cm2)
CREATE_CONV(conv_transpose_2d, _f32, _cm2)
CREATE_CONV(conv_transpose_2d, _f16_f32, _cm2)
CREATE_CONVS(_cm2)
} else
#endif
if (conv2d_UNROLL) {
CREATE_CONV(conv2d, _f32, _unroll)
CREATE_CONV(conv2d, _f16_f32, _unroll)
CREATE_CONV(conv_transpose_2d, _f32, _unroll)
CREATE_CONV(conv_transpose_2d, _f16_f32, _unroll)
CREATE_CONVS(_unroll)
} else {
CREATE_CONV(conv2d, _f32, )
CREATE_CONV(conv2d, _f16_f32, )
CREATE_CONV(conv_transpose_2d, _f32, )
CREATE_CONV(conv_transpose_2d, _f16_f32, )
CREATE_CONVS( )
}
#undef CREATE_CONV
#undef CREATE_CONVS
}

ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
Expand Down Expand Up @@ -12722,6 +12721,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
// Op is disabled for Apple because it segfaults at pipeline create time on MoltenVK
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
const vk_device& device = ggml_vk_get_device(ctx->device);
if (op->op == GGML_OP_CONV_TRANSPOSE_2D && !device->pipeline_conv_transpose_2d_f32[0]) {
return false;
}
// Channel-contiguous format is not supported yet.
return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
op->src[1]->type == GGML_TYPE_F32 &&
Expand Down
38 changes: 31 additions & 7 deletions ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
// shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
layout(binding = 0) readonly buffer A {
A_TYPE knl_data[];
}; // src0 - kernel: [KW, KH, Cin, Cout]
}; // src0 - kernel: [KW, KH, Cin, Cout] for conv_2d, [KW, KH, Cout, Cin] for conv_transpose_2d

layout(binding = 1) readonly buffer B {
B_TYPE src_data[];
Expand Down Expand Up @@ -66,6 +66,10 @@ layout(push_constant) uniform parameter {
uint32_t KWKHmp; uint32_t KWKHL;
uint32_t OWmp; uint32_t OWL;
uint32_t OWOHmp; uint32_t OWOHL;
#ifdef TRANSPOSE
uint32_t s0mp; uint32_t s0L;
uint32_t s1mp; uint32_t s1L;
#endif
}

p;
Expand Down Expand Up @@ -225,10 +229,16 @@ void main() {
uint32_t B_ly = r_offset + Ar;
uint32_t B_lx = Ac;
uint32_t K_idx = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/
uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03, K * CRS - 1);
float val = knl_data[knl_idx];
float val;
if (K_idx >= K || CRS_idx_a >= CRS) {
val = 0.0;
} else {
#ifdef TRANSPOSE
uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03, K * CRS - 1);
#else
uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03, K * CRS - 1);
#endif
val = knl_data[knl_idx];
}
Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val);
}
Expand Down Expand Up @@ -267,13 +277,27 @@ void main() {
KW_idx_b = CRS_remainder - KH_idx_b * p.KW;
#endif

#ifdef TRANSPOSE
uint32_t H_idx_x_s1 = OH_idx - KH_idx_b * p.d1 + p.p1;
uint32_t W_idx_x_s0 = OW_idx - KW_idx_b * p.d0 + p.p0;
uint32_t H_idx = fastdiv(H_idx_x_s1, p.s1mp, p.s1L);
uint32_t W_idx = fastdiv(W_idx_x_s0, p.s0mp, p.s0L);
#else
uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1;
uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0;
uint32_t src_idx =
min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1);
float val = src_data[src_idx];
if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) {
#endif
float val;
if (CRS_idx_b >= CRS || NPQ_idx >= NPQ
|| int32_t(H_idx) < 0 || H_idx >= p.H || int32_t(W_idx) < 0 || W_idx >= p.W
#ifdef TRANSPOSE
|| (H_idx_x_s1 - H_idx * p.s1 != 0) || (W_idx_x_s0 - W_idx * p.s0 != 0)
#endif
) {
val = 0.0;
} else {
uint32_t src_idx =
min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1);
val = src_data[src_idx];
}
Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val);
}
Expand Down
Loading
Loading