Skip to content

Commit 364d556

Browse files
committed
vulkan: Hybrid waitForFences/getFenceStatus to reduce fence latency
There seems to be a bubble waking up from waitForFences, which costs a few percent of performance and also increases variance in performance. This change inserts an "almost_ready" fence when the graph is about 80% complete; we call waitForFences on the almost_ready fence, then spin (with _mm_pause) until the final fence is signaled.
1 parent c95fa36 commit 364d556

File tree

1 file changed

+57
-15
lines changed

1 file changed

+57
-15
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 57 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <mutex>
2424
#include <future>
2525
#include <thread>
26+
#include <immintrin.h>
2627

2728
#include "ggml-impl.h"
2829
#include "ggml-backend-impl.h"
@@ -770,7 +771,8 @@ struct ggml_backend_vk_context {
770771
ggml_vk_garbage_collector gc;
771772
size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
772773
vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
773-
vk::Fence fence;
774+
vk::Fence fence, almost_ready_fence;
775+
bool almost_ready_fence_pending {};
774776

775777
vk_buffer buffer_pool[MAX_VK_BUFFERS];
776778

@@ -861,6 +863,39 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx
861863

862864
static void ggml_backend_vk_free(ggml_backend_t backend);
863865

866+
// Wait for ctx->fence to be signaled.
867+
static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
868+
// Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
869+
// during this wait.
870+
if (ctx->almost_ready_fence_pending) {
871+
VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence");
872+
ctx->device->device.resetFences({ ctx->almost_ready_fence });
873+
ctx->almost_ready_fence_pending = false;
874+
}
875+
876+
// Spin (w/pause) waiting for the graph to finish executing.
877+
vk::Result result;
878+
while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) {
879+
if (result != vk::Result::eNotReady) {
880+
fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__);
881+
exit(1);
882+
}
883+
for (uint32_t i = 0; i < 100; ++i) {
884+
_mm_pause();
885+
_mm_pause();
886+
_mm_pause();
887+
_mm_pause();
888+
_mm_pause();
889+
_mm_pause();
890+
_mm_pause();
891+
_mm_pause();
892+
_mm_pause();
893+
_mm_pause();
894+
}
895+
}
896+
ctx->device->device.resetFences({ ctx->fence });
897+
}
898+
864899
// variables to track number of compiles in progress
865900
static uint32_t compile_count = 0;
866901
static std::mutex compile_count_mutex;
@@ -3229,6 +3264,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
32293264
ctx->prealloc_size_split_k = 0;
32303265

32313266
ctx->fence = ctx->device->device.createFence({});
3267+
ctx->almost_ready_fence = ctx->device->device.createFence({});
32323268

32333269
#ifdef GGML_VULKAN_CHECK_RESULTS
32343270
const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
@@ -7522,11 +7558,11 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
75227558
}
75237559
}
75247560

7525-
static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
7561+
static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready);
75267562

75277563
// Returns true if node has enqueued work into the queue, false otherwise
75287564
// If submit is true, all operations queued so far are submitted to Vulkan, overlapping cmdlist creation with GPU execution.
7529-
static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
7565+
static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
75307566
if (ggml_is_empty(node) || !node->buffer) {
75317567
return false;
75327568
}
@@ -7898,7 +7934,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
78987934

78997935
ctx->compute_ctx.reset();
79007936

7901-
bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
7937+
bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false, almost_ready);
79027938
if (!ok) {
79037939
if (node->op == GGML_OP_UNARY) {
79047940
std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
@@ -7912,7 +7948,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
79127948
return true;
79137949
}
79147950

7915-
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
7951+
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) {
79167952
ggml_backend_buffer * buf = nullptr;
79177953

79187954
switch (tensor->op) {
@@ -8015,12 +8051,15 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
80158051
memcpy(cpy.dst, cpy.src, cpy.n);
80168052
}
80178053

8018-
ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
8054+
if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
8055+
ggml_vk_submit(subctx, ctx->almost_ready_fence);
8056+
ctx->almost_ready_fence_pending = true;
8057+
} else {
8058+
ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
8059+
}
80198060

80208061
if (use_fence) {
8021-
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
8022-
8023-
ctx->device->device.resetFences({ ctx->fence });
8062+
ggml_vk_wait_for_fence(ctx);
80248063
}
80258064
#ifdef GGML_VULKAN_CHECK_RESULTS
80268065
ggml_vk_check_results_1(tensor);
@@ -8106,6 +8145,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
81068145
ctx->gc.events.clear();
81078146

81088147
ctx->device->device.destroyFence(ctx->fence);
8148+
ctx->device->device.destroyFence(ctx->almost_ready_fence);
81098149
}
81108150

81118151
static int ggml_vk_get_device_count() {
@@ -8452,8 +8492,7 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
84528492
}
84538493

84548494
ggml_vk_submit(transfer_ctx, ctx->fence);
8455-
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
8456-
ctx->device->device.resetFences({ ctx->fence });
8495+
ggml_vk_wait_for_fence(ctx);
84578496

84588497
for (auto& cpy : transfer_ctx->out_memcpys) {
84598498
memcpy(cpy.dst, cpy.src, cpy.n);
@@ -8472,7 +8511,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
84728511

84738512
uint64_t total_mat_mul_bytes = 0;
84748513
for (int i = 0; i < cgraph->n_nodes; i++) {
8475-
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
8514+
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false);
84768515
if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
84778516
total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
84788517
}
@@ -8514,11 +8553,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
85148553
mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
85158554
}
85168555

8556+
// Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
8557+
bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
85178558
bool submit = (submitted_nodes >= nodes_per_submit) ||
85188559
(mul_mat_bytes >= mul_mat_bytes_per_submit) ||
8519-
(i == last_node);
8560+
(i == last_node) ||
8561+
(almost_ready && !ctx->almost_ready_fence_pending);
85208562

8521-
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
8563+
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
85228564

85238565
if (enqueued) {
85248566
++submitted_nodes;
@@ -8530,7 +8572,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
85308572
#endif
85318573
}
85328574

8533-
if (submit) {
8575+
if (submit && enqueued) {
85348576
first_node_in_batch = true;
85358577
submitted_nodes = 0;
85368578
mul_mat_bytes = 0;

0 commit comments

Comments
 (0)