Skip to content

Commit 64a367b

Browse files
committed
vulkan: 64-bit im2col
Add variants of the im2col shaders that use buffer_device_address/buffer_reference and perform 64-bit address calculations. This is needed for the large convolutions used in stable-diffusion.cpp, where the im2col destination can exceed 4 GB and 32-bit element offsets would overflow.
1 parent 3f81b4e commit 64a367b

File tree

6 files changed

+113
-26
lines changed

6 files changed

+113
-26
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 49 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -406,6 +406,8 @@ struct vk_device_struct {
406406
bool subgroup_ballot;
407407
bool subgroup_clustered;
408408
bool multi_add;
409+
bool shader_int64;
410+
bool buffer_device_address;
409411

410412
bool add_rms_fusion;
411413
uint32_t partials_binding_alignment;
@@ -653,6 +655,7 @@ struct vk_buffer_struct {
653655
vk::MemoryPropertyFlags memory_property_flags;
654656
void * ptr;
655657
size_t size = 0;
658+
vk::DeviceAddress bda_addr {};
656659

657660
vk_device device;
658661

@@ -985,6 +988,7 @@ struct vk_op_argsort_push_constants {
985988
};
986989

987990
struct vk_op_im2col_push_constants {
991+
uint64_t dst_addr;
988992
uint32_t batch_offset; uint32_t offset_delta;
989993
uint32_t IC;
990994
uint32_t IW; uint32_t IH;
@@ -998,6 +1002,7 @@ struct vk_op_im2col_push_constants {
9981002
};
9991003

10001004
struct vk_op_im2col_3d_push_constants {
1005+
uint64_t dst_addr;
10011006
uint32_t nb10;
10021007
uint32_t nb11;
10031008
uint32_t nb12;
@@ -2010,10 +2015,17 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
20102015
return buf;
20112016
}
20122017

2018+
vk::BufferUsageFlags usage_flags = vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst;
2019+
vk::MemoryAllocateFlags mem_flags {};
2020+
if (device->buffer_device_address) {
2021+
usage_flags |= vk::BufferUsageFlagBits::eShaderDeviceAddress;
2022+
mem_flags |= vk::MemoryAllocateFlagBits::eDeviceAddress;
2023+
}
2024+
20132025
vk::BufferCreateInfo buffer_create_info{
20142026
vk::BufferCreateFlags(),
20152027
size,
2016-
vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
2028+
usage_flags,
20172029
vk::SharingMode::eExclusive,
20182030
0,
20192031
nullptr,
@@ -2025,6 +2037,8 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
20252037

20262038
vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
20272039

2040+
const vk::MemoryAllocateFlagsInfo mem_flags_info { mem_flags };
2041+
20282042
for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) {
20292043
const auto & req_flags = *it;
20302044

@@ -2036,7 +2050,7 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
20362050
buf->memory_property_flags = req_flags;
20372051

20382052
try {
2039-
buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index });
2053+
buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index, &mem_flags_info });
20402054
break;
20412055
} catch (const vk::SystemError& e) {
20422056
// loop and retry
@@ -2064,6 +2078,11 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
20642078
buf->device = device;
20652079
buf->size = size;
20662080

2081+
if (device->buffer_device_address) {
2082+
const vk::BufferDeviceAddressInfo addressInfo(buf->buffer);
2083+
buf->bda_addr = device->device.getBufferAddress(addressInfo);
2084+
}
2085+
20672086
#ifdef GGML_VULKAN_MEMORY_DEBUG
20682087
device->memory_logger->log_allocation(buf, size);
20692088
#endif
@@ -3530,14 +3549,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
35303549

35313550
ggml_vk_create_pipeline(device, device->pipeline_count_equal_i32, "count_equal_i32", count_equal_i32_len, count_equal_i32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, { device->subgroup_size }, 1);
35323551

3533-
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
3534-
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32, "im2col_3d_f32", im2col_3d_f32_len, im2col_3d_f32_data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true);
3535-
if (device->float_controls_rte_fp16) {
3536-
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
3537-
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16_rte_len, im2col_3d_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true);
3552+
#define IM2COL(bda) \
3553+
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32 ## bda ## _len, im2col_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \
3554+
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32, "im2col_3d_f32", im2col_3d_f32 ## bda ## _len, im2col_3d_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); \
3555+
if (device->float_controls_rte_fp16) { \
3556+
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte ## bda ## _len, im2col_f32_f16_rte ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \
3557+
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16_rte ## bda ## _len, im2col_3d_f32_f16_rte ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); \
3558+
} else { \
3559+
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16 ## bda ## _len, im2col_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \
3560+
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16 ## bda ## _len, im2col_3d_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); \
3561+
}
3562+
if (device->shader_int64 && device->buffer_device_address) {
3563+
IM2COL(_bda)
35383564
} else {
3539-
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
3540-
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16_len, im2col_3d_f32_f16_data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true);
3565+
IM2COL()
35413566
}
35423567

35433568
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
@@ -4015,6 +4040,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
40154040
device->vendor_id != VK_VENDOR_ID_INTEL &&
40164041
getenv("GGML_VK_DISABLE_MULTI_ADD") == nullptr;
40174042

4043+
device->shader_int64 = device_features2.features.shaderInt64;
4044+
device->buffer_device_address = vk12_features.bufferDeviceAddress;
4045+
40184046
if (device->subgroup_size_control) {
40194047
device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize;
40204048
device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize;
@@ -9443,7 +9471,13 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co
94439471

94449472
const uint32_t pelements = OW * KW * KH;
94459473

9474+
const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
9475+
const vk_buffer d_buf = d_buf_ctx->dev_buffer;
9476+
9477+
const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs;
9478+
94469479
ggml_vk_op_f32<vk_op_im2col_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_IM2COL, {
9480+
dst_addr,
94479481
batch_offset, offset_delta,
94489482
IC, IW, IH, OW, OH, KW, KH,
94499483
pelements,
@@ -9479,8 +9513,14 @@ static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx,
94799513
const int64_t OH = ne2;
94809514
const int64_t OW = ne1;
94819515

9516+
const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
9517+
const vk_buffer d_buf = d_buf_ctx->dev_buffer;
9518+
9519+
const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs;
9520+
94829521
vk_op_im2col_3d_push_constants pc {};
94839522

9523+
pc.dst_addr = dst_addr;
94849524
pc.nb10 = nb10 / ggml_type_size(src1->type);
94859525
pc.nb11 = nb11 / ggml_type_size(src1->type);
94869526
pc.nb12 = nb12 / ggml_type_size(src1->type);

ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,11 @@
55

66
#include "rte.comp"
77

8+
#include "types.comp"
9+
810
layout (push_constant) uniform parameter
911
{
12+
BDA_STORAGE_T dst_addr;
1013
uint batch_offset; uint offset_delta;
1114
uint IC;
1215
uint IW; uint IH;
@@ -19,8 +22,6 @@ layout (push_constant) uniform parameter
1922
int d0; int d1;
2023
} p;
2124

22-
#include "types.comp"
23-
2425
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
2526

2627
const uint NUM_ITER = 512 / BLOCK_SIZE;
@@ -30,6 +31,10 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
3031
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
3132
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
3233

34+
#if BDA
35+
layout (buffer_reference) buffer D_ptr {D_TYPE d;};
36+
#endif
37+
3338
void main() {
3439
const uint gidx = gl_GlobalInvocationID.x;
3540

@@ -38,7 +43,7 @@ void main() {
3843
const uint ic = gl_GlobalInvocationID.z % p.IC;
3944

4045
const uint src_base = ic * p.offset_delta + batch * p.batch_offset;
41-
const uint dst_base = ((batch * p.OH + oh) * p.OW) * p.CHW + ic * (p.KW * p.KH);
46+
const BDA_OFFSET_T dst_base = ((BDA_OFFSET_T(batch) * p.OH + oh) * p.OW) * p.CHW + BDA_OFFSET_T(ic) * (p.KW * p.KH);
4247
const int oh_s1 = int(oh) * p.s1;
4348
const uint ksize = p.OW * p.KH;
4449

@@ -50,7 +55,7 @@ void main() {
5055
uint current_ix = rem % p.OW;
5156

5257
A_TYPE values[NUM_ITER];
53-
uint offset_dst[NUM_ITER];
58+
BDA_OFFSET_T offset_dst[NUM_ITER];
5459
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
5560
values[idx] = A_TYPE(0);
5661
}
@@ -66,7 +71,7 @@ void main() {
6671
const uint iiw = current_ix * p.s0 + current_kx * p.d0 - p.p0;
6772
const uint iih = oh_s1 + current_ky * p.d1 - p.p1;
6873

69-
offset_dst[idx] = dst_base + current_ix * p.CHW + current_ky * p.KW + current_kx;
74+
offset_dst[idx] = dst_base + BDA_OFFSET_T(current_ix) * p.CHW + current_ky * p.KW + current_kx;
7075

7176
if ((iih < p.IH) && (iiw < p.IW)) {
7277
values[idx] = data_a[src_base + iih * p.IW + iiw];
@@ -89,7 +94,11 @@ void main() {
8994
continue;
9095
}
9196

97+
#if BDA
98+
D_ptr dst_addr = D_ptr(p.dst_addr + D_SIZE * offset_dst[idx]);
99+
dst_addr.d = D_TYPE(values[idx]);
100+
#else
92101
data_d[offset_dst[idx]] = D_TYPE(values[idx]);
102+
#endif
93103
}
94-
95104
}

ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66

77
#include "rte.comp"
88

9+
#include "types.comp"
10+
911
layout (push_constant) uniform parameter
1012
{
13+
BDA_STORAGE_T dst_addr;
1114
uint32_t nb10;
1215
uint32_t nb11;
1316
uint32_t nb12;
@@ -38,8 +41,6 @@ layout (push_constant) uniform parameter
3841
uint32_t misalign_offsets;
3942
} p;
4043

41-
#include "types.comp"
42-
4344
uint get_aoffset() { return p.misalign_offsets >> 16; }
4445
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
4546

@@ -50,6 +51,10 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
5051
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
5152
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
5253

54+
#if BDA
55+
layout (buffer_reference) buffer D_ptr {D_TYPE d;};
56+
#endif
57+
5358
void main() {
5459
const uint32_t i = gl_GlobalInvocationID.x;
5560

@@ -100,13 +105,22 @@ void main() {
100105
const uint32_t iih = ioh * s1 + ikh * d1 - p1;
101106
const uint32_t iid = iod * s2 + ikd * d2 - p2;
102107

103-
const uint32_t offset_dst = in_*OD_OH_OW_IC_KD_KH_KW + iod*OH_OW_IC_KD_KH_KW + ioh*OW_IC_KD_KH_KW + iow*IC_KD_KH_KW + iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw;
108+
const BDA_OFFSET_T offset_dst = BDA_OFFSET_T(in_)*OD_OH_OW_IC_KD_KH_KW + BDA_OFFSET_T(iod)*OH_OW_IC_KD_KH_KW + BDA_OFFSET_T(ioh)*OW_IC_KD_KH_KW + BDA_OFFSET_T(iow)*IC_KD_KH_KW + iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw;
104109

110+
const uint32_t offset_src = (in_*IC + iic)*nb13 + iid*nb12 + iih*nb11 + iiw*nb10;
111+
#if BDA
112+
D_ptr dst_addr = D_ptr(p.dst_addr + D_SIZE * offset_dst);
113+
if (iih >= IH || iiw >= IW || iid >= ID) {
114+
dst_addr.d = D_TYPE(0.0f);
115+
} else {
116+
dst_addr.d = D_TYPE(data_a[offset_src + get_aoffset()]);
117+
}
118+
#else
105119
if (iih >= IH || iiw >= IW || iid >= ID) {
106120
data_d[offset_dst + get_doffset()] = D_TYPE(0.0f);
107121
} else {
108-
const uint32_t offset_src = (in_*IC + iic)*nb13 + iid*nb12 + iih*nb11 + iiw*nb10;
109122
data_d[offset_dst + get_doffset()] = D_TYPE(data_a[offset_src + get_aoffset()]);
110123
}
124+
#endif
111125
}
112126
}

ggml/src/ggml-vulkan/vulkan-shaders/types.comp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1447,4 +1447,19 @@ float e8m0_to_fp32(uint8_t x) {
14471447
return uintBitsToFloat(bits);
14481448
}
14491449

1450+
#if BDA
1451+
1452+
#extension GL_EXT_buffer_reference : enable
1453+
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
1454+
1455+
#define BDA_STORAGE_T uint64_t
1456+
#define BDA_OFFSET_T uint64_t
1457+
1458+
#else
1459+
1460+
#define BDA_STORAGE_T uvec2
1461+
#define BDA_OFFSET_T uint
1462+
1463+
#endif
1464+
14501465
#endif // !defined(GGML_TYPES_COMP)

ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -775,13 +775,15 @@ void process_shaders() {
775775
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
776776
string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}));
777777

778-
string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
779-
string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
780-
string_to_spv("im2col_f32_f16_rte", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}));
781-
782-
string_to_spv("im2col_3d_f32", "im2col_3d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
783-
string_to_spv("im2col_3d_f32_f16", "im2col_3d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
784-
string_to_spv("im2col_3d_f32_f16_rte", "im2col_3d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}));
778+
for (std::string dim_str : {"", "_3d"}) {
779+
for (bool bda : {false, true}) {
780+
std::string bda_str = bda ? "_bda" : "";
781+
std::string bda_def = bda ? "1" : "0";
782+
string_to_spv("im2col" + dim_str + "_f32" + bda_str, "im2col" + dim_str + ".comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"D_SIZE", "4"}, {"BDA", bda_def}}));
783+
string_to_spv("im2col" + dim_str + "_f32_f16" + bda_str, "im2col" + dim_str + ".comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"D_SIZE", "2"}, {"BDA", bda_def}}));
784+
string_to_spv("im2col" + dim_str + "_f32_f16_rte" + bda_str, "im2col" + dim_str + ".comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"D_SIZE", "2"}, {"RTE16", "1"}, {"BDA", bda_def}}));
785+
}
786+
}
785787

786788
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
787789

tests/test-backend-ops.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5753,6 +5753,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
57535753
}
57545754
}
57555755

5756+
#if 0
5757+
// >4GB im2col destination. Too slow to run by default.
5758+
// Test cases taken from Wan2.1 T2V 1.3B.
5759+
test_cases.emplace_back(new test_im2col (GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {832, 480, 192, 4}, {3, 3, 192, 96}, 1, 1, 1, 1, 1, 1, true));
5760+
test_cases.emplace_back(new test_im2col_3d(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {834, 482, 6, 96}, {3, 3,3, 9216}, 96, 1, 1, 1, 0, 0, 0, 1, 1, 1, false));
5761+
#endif
5762+
57565763
// im2col 1D
57575764
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
57585765
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));

0 commit comments

Comments
 (0)