Skip to content

Commit 216b766

Browse files
committed
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .github/workflows/build-riscv-native.yml # .github/workflows/build.yml # ci/README.md # ci/run.sh # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-opencl/kernels/cvt.cl # tests/test-backend-ops.cpp
2 parents b5931c9 + a20d810 commit 216b766

File tree

7 files changed

+91
-24
lines changed

7 files changed

+91
-24
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 53 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1201,6 +1201,14 @@ struct vk_staging_memcpy {
12011201
size_t n;
12021202
};
12031203

1204+
// One pending host-side memset: destination pointer, fill value and byte count.
// Entries are collected in vk_context_struct::memsets and later replayed with
// memset(dst, val, n).
struct vk_staging_memset {
1205+
vk_staging_memset(void * _dst, uint32_t _val, size_t _n) : dst(_dst), val(_val), n(_n) {}
1206+
1207+
void * dst; // destination address the replayed memset() writes to
1208+
uint32_t val; // fill value; memset() converts it to unsigned char, so only the low byte is written
1209+
size_t n; // number of bytes to fill
1210+
};
1211+
12041212
struct vk_context_struct {
12051213
vk_submission * s;
12061214
std::vector<vk_sequence> seqs;
@@ -1209,6 +1217,7 @@ struct vk_context_struct {
12091217

12101218
std::vector<vk_staging_memcpy> in_memcpys;
12111219
std::vector<vk_staging_memcpy> out_memcpys;
1220+
std::vector<vk_staging_memset> memsets;
12121221

12131222
vk_command_pool * p {};
12141223
};
@@ -1600,7 +1609,9 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
16001609
}
16011610

16021611
vk::ComputePipelineCreateInfo compute_pipeline_create_info(
1603-
vk::PipelineCreateFlags{},
1612+
device->pipeline_executable_properties_support ?
1613+
vk::PipelineCreateFlagBits::eCaptureStatisticsKHR :
1614+
vk::PipelineCreateFlags{},
16041615
pipeline_shader_create_info,
16051616
pipeline->layout);
16061617

@@ -3396,7 +3407,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
33963407
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
33973408
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
33983409

3399-
CREATE_UNARY(exp)
34003410
CREATE_UNARY(gelu)
34013411
CREATE_UNARY(gelu_erf)
34023412
CREATE_UNARY(gelu_quick)
@@ -3408,6 +3418,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
34083418
CREATE_UNARY(hardswish)
34093419
#undef CREATE_UNARY
34103420

3421+
#define CREATE_UNARY_RTE(name) \
3422+
if (device->float_controls_rte_fp16) { \
3423+
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32_rte", name ## _f32_rte_len, name ## _f32_rte_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
3424+
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16_rte", name ## _f16_rte_len, name ## _f16_rte_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
3425+
} else { \
3426+
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
3427+
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
3428+
}
3429+
CREATE_UNARY_RTE(exp)
3430+
#undef CREATE_UNARY_RTE
3431+
34113432
#define CREATE_GLU(name) \
34123433
if (device->float_controls_rte_fp16) { \
34133434
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32_rte", name ## _f32_rte_len, name ## _f32_rte_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
@@ -5224,6 +5245,14 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
52245245
}
52255246
}
52265247

5248+
// Fill `size` bytes at `dst` with `val`, either immediately or later.
//   dst     - destination address
//   val     - fill value; it ends up in memset(), which only uses its low byte
//   size    - number of bytes to fill
//   memsets - when nullptr the memset runs right away; otherwise the request is
//             appended to this list and nothing is written yet
static void deferred_memset(void * dst, uint32_t val, size_t size, std::vector<vk_staging_memset>* memsets = nullptr) {
5249+
if (memsets == nullptr) {
5250+
memset(dst, val, size); // no list supplied: perform the fill now
5251+
} else {
5252+
memsets->emplace_back(dst, val, size); // queue the request for whoever replays the list
5253+
}
5254+
}
5255+
52275256
static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
52285257
if (device->sync_staging == nullptr || device->sync_staging->size < size) {
52295258
VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
@@ -5419,6 +5448,10 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
54195448
memcpy(cpy.dst, cpy.src, cpy.n);
54205449
}
54215450

5451+
for (auto& mset : subctx->memsets) {
5452+
memset(mset.dst, mset.val, mset.n);
5453+
}
5454+
54225455
ggml_vk_submit(subctx, dst->device->fence);
54235456
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
54245457
dst->device->device.resetFences({ dst->device->fence });
@@ -5558,12 +5591,25 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
55585591
// Fill `size` bytes of buffer `dst`, starting at byte `offset`, with the value `c`,
// without waiting for completion.
// For a host-visible buffer on a UMA device the fill is queued as a host memset in
// ctx->memsets; otherwise a fillBuffer command is recorded into ctx->s->buffer.
// NOTE(review): the host path goes through memset(), which writes only the low byte
// of `c`, while fillBuffer writes the whole 32-bit word - the two paths agree only
// when all four bytes of `c` are equal. Confirm callers pass such values.
// NOTE(review): queued memsets appear to be replayed on the CPU just before the
// context is submitted, so their ordering relative to commands already recorded in
// the same command buffer may differ from the fillBuffer path - confirm.
static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
55595592
VK_LOG_DEBUG("ggml_vk_buffer_memset_async(" << offset << ", " << c << ", " << size << ")");
55605593

5594+
if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
5595+
dst->device->uma) {
5596+
deferred_memset((uint8_t*)dst->ptr + offset, c, size, &ctx->memsets); // queue a host write at the mapped pointer + offset
5597+
return;
5598+
}
5599+
5600+
// Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers
55615601
ctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
55625602
}
55635603

55645604
static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
55655605
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
55665606

5607+
if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
5608+
dst->device->uma) {
5609+
memset((uint8_t*)dst->ptr + offset, c, size);
5610+
return;
5611+
}
5612+
55675613
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
55685614
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
55695615
ggml_vk_ctx_begin(dst->device, subctx);
@@ -11198,6 +11244,10 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
1119811244
memcpy(cpy.dst, cpy.src, cpy.n);
1119911245
}
1120011246

11247+
for (auto& mset : subctx->memsets) {
11248+
memset(mset.dst, mset.val, mset.n);
11249+
}
11250+
1120111251
if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
1120211252
ggml_vk_submit(subctx, ctx->almost_ready_fence);
1120311253
ctx->almost_ready_fence_pending = true;
@@ -11220,6 +11270,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
1122011270
}
1122111271
subctx->in_memcpys.clear();
1122211272
subctx->out_memcpys.clear();
11273+
subctx->memsets.clear();
1122311274
}
1122411275

1122511276
return true;

ggml/src/ggml-vulkan/vulkan-shaders/exp.comp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
#version 450
22

3+
#include "rte.comp"
34
#include "generic_head.comp"
45
#include "types.comp"
56

ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp

Lines changed: 20 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -31,10 +31,22 @@
3131
#include "types.comp"
3232

3333
#ifndef LOAD_VEC_A
34-
#define LOAD_VEC_A 2
34+
#define LOAD_VEC_A 1
3535
#endif
3636
#ifndef LOAD_VEC_B
37-
#define LOAD_VEC_B 2
37+
#define LOAD_VEC_B 1
38+
#endif
39+
40+
// Load 2 values at once without affecting index calculations through LOAD_VEC
41+
#if (defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16)) && !defined(ALIGNED)
42+
#define LOAD_VEC_BATCH_A 2
43+
#else
44+
#define LOAD_VEC_BATCH_A 1
45+
#endif
46+
#if !defined(ALIGNED)
47+
#define LOAD_VEC_BATCH_B 2
48+
#else
49+
#define LOAD_VEC_BATCH_B 1
3850
#endif
3951

4052
#if !defined(TO_FLOAT_TYPE)
@@ -236,13 +248,13 @@ void main() {
236248
const uint warp_r = warp_i % (BM / WM);
237249
const uint warp_c = warp_i / (BM / WM);
238250

239-
const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A);
240-
const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A);
241-
const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B);
242-
const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B);
251+
const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A / LOAD_VEC_BATCH_A);
252+
const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A / LOAD_VEC_BATCH_A);
253+
const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B / LOAD_VEC_BATCH_B);
254+
const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B / LOAD_VEC_BATCH_B);
243255

244-
const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A / BK;
245-
const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B / BK;
256+
const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A * LOAD_VEC_BATCH_A / BK;
257+
const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B * LOAD_VEC_BATCH_B / BK;
246258

247259
#ifdef MUL_MAT_ID
248260
#ifdef MUL_MAT_ID_USE_SUBGROUPS

ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
1414
FLOAT_TYPE_VEC4 aa = FLOAT_TYPE_VEC4(data_a[idx]);
1515
buf_a[buf_idx ] = aa.xy;
1616
buf_a[buf_idx + 1] = aa.zw;
17-
#else // LOAD_VEC_A == 2
18-
const uint idx = pos_a * 2 + col * p.stride_a + row * 2;
17+
#else // LOAD_VEC_BATCH_A == 2
18+
const uint idx = pos_a + col * p.stride_a + row * 2;
1919
const uint buf_idx = col * SHMEM_STRIDE + row;
2020
if (idx_m < p.M && block + row * 2 + 1 < end_k) {
2121
buf_a[buf_idx] = FLOAT_TYPE_VEC2(data_a[idx],
@@ -33,8 +33,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
3333
FLOAT_TYPE_VEC4 aa = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_a[idx]));
3434
buf_a[buf_idx ] = aa.xy;
3535
buf_a[buf_idx + 1] = aa.zw;
36-
#else // LOAD_VEC_A == 2
37-
const uint idx = pos_a * 2 + col * p.stride_a + row * 2;
36+
#else // LOAD_VEC_BATCH_A == 2
37+
const uint idx = pos_a + col * p.stride_a + row * 2;
3838
const uint buf_idx = col * SHMEM_STRIDE + row;
3939
if (idx_m < p.M && block + row * 2 + 1 < end_k) {
4040
buf_a[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_a[idx]),
@@ -500,8 +500,8 @@ void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uin
500500
#endif
501501
buf_b[buf_idx + 0] = bb.xy;
502502
buf_b[buf_idx + 1] = bb.zw;
503-
#else // LOAD_VEC_B == 2
504-
const uint idx = pos_b * 2 + col * p.stride_b + row * 2;
503+
#else // LOAD_VEC_BATCH_B == 2
504+
const uint idx = pos_b + col * p.stride_b + row * 2;
505505
const uint buf_idx = col * SHMEM_STRIDE + row;
506506
if (idx_n < p.N && block + row * 2 + 1 < end_k) {
507507
buf_b[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_b[idx]),
@@ -536,17 +536,17 @@ void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uin
536536
#endif
537537
buf_b[buf_idx + 0] = bb.xy;
538538
buf_b[buf_idx + 1] = bb.zw;
539-
#else // LOAD_VEC_B == 2
539+
#else // LOAD_VEC_BATCH_B == 2
540540
const uint row_i = ic * BN + col;
541541
const uint buf_idx = col * SHMEM_STRIDE + row;
542542
if (row_i < _ne1 && block + row * 2 + 1 < end_k) {
543543
const u16vec2 row_idx = row_ids[col];
544-
const uint idx = pos_b * 2 + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
544+
const uint idx = pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
545545
buf_b[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_b[idx]),
546546
TO_FLOAT_TYPE(data_b[idx + 1]));
547547
} else if (row_i < _ne1 && block + row * 2 < end_k) {
548548
const u16vec2 row_idx = row_ids[col];
549-
const uint idx = pos_b * 2 + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
549+
const uint idx = pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
550550
buf_b[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_b[idx]), 0.0f);
551551
} else {
552552
buf_b[buf_idx] = FLOAT_TYPE_VEC2(0.0f);

ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -468,7 +468,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
468468

469469
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
470470
// For unaligned, load one at a time for f32/f16, or two at a time for quants
471-
std::string load_vec_a_unaligned = coopmat2 ? "1" : (tname == "f32" || tname == "f16" || tname == "bf16") ? "2" : load_vec_quant;
471+
std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? "1" : load_vec_quant;
472472
// For aligned matmul loads
473473
std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? load_vec : load_vec_quant;
474474

@@ -718,8 +718,11 @@ void process_shaders() {
718718

719719
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
720720

721-
string_to_spv("exp_f16", "exp.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
722-
string_to_spv("exp_f32", "exp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
721+
for (auto rte : {false, true}) {
722+
std::string suffix = rte ? "_rte" : "";
723+
string_to_spv("exp_f16" + suffix, "exp.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
724+
string_to_spv("exp_f32" + suffix, "exp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"} , {"RTE16", rte ? "1" : "0"}});
725+
}
723726
string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
724727
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
725728
string_to_spv("gelu_erf_f16", "gelu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});

koboldcpp.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,7 @@
6464
extra_images_max = 4
6565

6666
# global vars
67-
KcppVersion = "1.99.1"
67+
KcppVersion = "1.99.2"
6868
showdebug = True
6969
kcpp_instance = None #global running instance
7070
global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False, "restart_override_config_target":""}

tools/main/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -179,7 +179,7 @@ int main(int argc, char ** argv) {
179179
return 1;
180180
}
181181

182-
// Start the non-batch threadpool in the paused state
182+
// start the non-batch threadpool in the paused state
183183
tpp.paused = true;
184184
}
185185

0 commit comments

Comments
 (0)