Skip to content

Commit 5d09cb2

Browse files
committed
https://github.com/ggerganov/llama.cpp/issues/11339
Eval bug: vulkan: regression: vram usage increased
1 parent 1b2f685 commit 5d09cb2

File tree

1 file changed

+41
-23
lines changed

1 file changed

+41
-23
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 41 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -93,6 +93,10 @@ struct vk_pipeline_struct {
9393
uint32_t parameter_count;
9494
std::array<uint32_t, 3> wg_denoms;
9595
uint32_t align;
96+
// set to true to request the pipeline is compiled after the dryrun
97+
bool needed {};
98+
// set to true when the shader has been compiled
99+
bool compiled {};
96100
};
97101

98102
typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
@@ -186,16 +190,19 @@ struct vk_device_struct {
186190
bool mul_mat_id_m;
187191
bool mul_mat_id_s;
188192

189-
vk_matmul_pipeline pipeline_matmul_f32;
190-
vk_matmul_pipeline pipeline_matmul_f32_f16;
193+
// set to true to indicate that some shaders need to be compiled after the dryrun
194+
bool need_compiles {};
195+
196+
vk_matmul_pipeline pipeline_matmul_f32 {};
197+
vk_matmul_pipeline pipeline_matmul_f32_f16 {};
191198
vk_matmul_pipeline2 pipeline_matmul_f16;
192199
vk_matmul_pipeline2 pipeline_matmul_f16_f32;
193200
vk_pipeline pipeline_matmul_split_k_reduce;
194201

195202
vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT];
196203
vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
197204

198-
vk_matmul_pipeline pipeline_matmul_id_f32;
205+
vk_matmul_pipeline pipeline_matmul_id_f32 {};
199206
vk_matmul_pipeline2 pipeline_matmul_id_f16;
200207
vk_matmul_pipeline2 pipeline_matmul_id_f16_f32;
201208

@@ -758,13 +765,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
758765
GGML_ASSERT(parameter_count > 0);
759766
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
760767

761-
pipeline = std::make_shared<vk_pipeline_struct>();
762-
pipeline->name = name;
763-
pipeline->parameter_count = parameter_count;
764-
pipeline->push_constant_size = push_constant_size;
765-
pipeline->wg_denoms = wg_denoms;
766-
pipeline->align = align;
767-
768768
vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
769769
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
770770

@@ -833,6 +833,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
833833
}
834834

835835
pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
836+
pipeline->compiled = true;
836837

837838
{
838839
std::lock_guard<std::mutex> guard(device->mutex);
@@ -844,11 +845,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
844845
assert(compile_count > 0);
845846
compile_count--;
846847

847-
// "Progress bar" for shader compiles
848-
static uint32_t total_compile_count = 0;
849-
if ((total_compile_count++ % 10) == 0) {
850-
std::cerr << ".";
851-
}
852848
}
853849
compile_count_cond.notify_all();
854850
}
@@ -874,6 +870,10 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
874870
static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
875871
VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
876872
device->pipeline_descriptor_set_requirements[pipeline->name] += n;
873+
if (!pipeline->compiled) {
874+
pipeline->needed = true;
875+
device->need_compiles = true;
876+
}
877877
}
878878

879879
static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
@@ -1356,8 +1356,6 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
13561356
static void ggml_vk_load_shaders(vk_device& device) {
13571357
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
13581358

1359-
std::cerr << "ggml_vulkan: Compiling shaders";
1360-
13611359
// some shaders require the subgroup size to be 16 or larger
13621360
const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
13631361

@@ -1494,13 +1492,30 @@ static void ggml_vk_load_shaders(vk_device& device) {
14941492
}
14951493
}
14961494

1497-
device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
1498-
device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
1499-
1500-
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
1501-
1495+
if (!device->pipeline_matmul_f32) {
1496+
device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
1497+
}
1498+
if (!device->pipeline_matmul_f32_f16) {
1499+
device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
1500+
}
1501+
if (!device->pipeline_matmul_id_f32) {
1502+
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
1503+
}
15021504
std::vector<std::future<void>> compiles;
15031505
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align, bool disable_robustness = false) {
1506+
if (!pipeline) {
1507+
pipeline = std::make_shared<vk_pipeline_struct>();
1508+
pipeline->name = name;
1509+
pipeline->parameter_count = parameter_count;
1510+
pipeline->push_constant_size = push_constant_size;
1511+
pipeline->wg_denoms = wg_denoms;
1512+
pipeline->align = align;
1513+
}
1514+
1515+
if (!pipeline->needed || pipeline->compiled) {
1516+
return;
1517+
}
1518+
15041519
{
15051520
// wait until fewer than N compiles are in progress
15061521
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
@@ -1940,7 +1955,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
19401955
for (auto &c : compiles) {
19411956
c.wait();
19421957
}
1943-
std::cerr << "Done!" << std::endl;
1958+
device->need_compiles = false;
19441959
}
19451960

19461961
static vk_device ggml_vk_get_device(size_t idx) {
@@ -7225,6 +7240,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
72257240
for (int i = 0; i < cgraph->n_nodes; i++) {
72267241
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
72277242
}
7243+
if (ctx->device->need_compiles) {
7244+
ggml_vk_load_shaders(ctx->device);
7245+
}
72287246
ggml_vk_preallocate_buffers(ctx);
72297247
ggml_pipeline_allocate_descriptor_sets(ctx->device);
72307248

0 commit comments

Comments (0)