 #include <future>
 #include <thread>
 
+#if defined(_MSC_VER)
+# define NOMINMAX 1
+# include <windows.h>
+# define YIELD() YieldProcessor()
+#elif defined(__clang__) || defined(__GNUC__)
+# if defined(__x86_64__) || defined(__i386__)
+#  include <immintrin.h>
+#  define YIELD() _mm_pause()
+# elif defined(__arm__) || defined(__aarch64__)
+#  if defined(__clang__)
+#   include <arm_acle.h>
+#   define YIELD() __yield()
+#  else
+#   define YIELD() asm volatile("yield")
+#  endif
+# endif
+#endif
+
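+// Fall back to a no-op if no pause/yield instruction is known for this
+// compiler/architecture combination, so the spin loop below compiles everywhere.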
+#if !defined(YIELD)
+#define YIELD()
+#endif
+
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
@@ -787,7 +809,8 @@ struct ggml_backend_vk_context {
     ggml_vk_garbage_collector gc;
     size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
     vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
-    vk::Fence fence;
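+    // almost_ready_fence is signaled once most of the graph has executed; the CPU
+    // blocks on it first, then spin-waits on `fence` for the few remaining nodes.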
+    vk::Fence fence, almost_ready_fence;
+    bool almost_ready_fence_pending {};
 
     vk_buffer buffer_pool[MAX_VK_BUFFERS];
 
@@ -878,6 +901,39 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx
 
 static void ggml_backend_vk_free(ggml_backend_t backend);
 
+// Wait for ctx->fence to be signaled.
+static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
+    // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
+    // during this wait.
+    if (ctx->almost_ready_fence_pending) {
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence");
+        ctx->device->device.resetFences({ ctx->almost_ready_fence });
+        ctx->almost_ready_fence_pending = false;
+    }
+
+    // Spin (w/pause) waiting for the graph to finish executing.
+    vk::Result result;
+    while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) {
+        if (result != vk::Result::eNotReady) {
+            fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__);
+            exit(1);
+        }
+        for (uint32_t i = 0; i < 100; ++i) {
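+            // Ten yields per iteration keeps loop overhead low; the fence status
+            // is re-polled roughly once per thousand pause instructions.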
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+        }
+    }
+    ctx->device->device.resetFences({ ctx->fence });
+}
+
 // variables to track number of compiles in progress
 static uint32_t compile_count = 0;
 static std::mutex compile_count_mutex;
@@ -3355,6 +3411,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     ctx->prealloc_size_split_k = 0;
 
     ctx->fence = ctx->device->device.createFence({});
+    ctx->almost_ready_fence = ctx->device->device.createFence({});
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
     const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
@@ -7959,11 +8016,11 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready);
 
 // Returns true if node has enqueued work into the queue, false otherwise
 // If submit is true, all operations queued so far are submitted to Vulkan, to overlap cmdlist creation and GPU execution.
-static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
     if (ggml_is_empty(node) || !node->buffer) {
         return false;
     }
@@ -8335,7 +8392,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         ctx->compute_ctx.reset();
 
-        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false, almost_ready);
         if (!ok) {
             if (node->op == GGML_OP_UNARY) {
                 std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
@@ -8349,7 +8406,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true) {
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) {
     ggml_backend_buffer * buf = nullptr;
 
     switch (tensor->op) {
@@ -8452,12 +8509,15 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
 
-        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
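+        // For an intermediate submit near the end of the graph, attach the
+        // almost_ready fence so ggml_vk_wait_for_fence can block on it later.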
+        if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
+            ggml_vk_submit(subctx, ctx->almost_ready_fence);
+            ctx->almost_ready_fence_pending = true;
+        } else {
+            ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+        }
 
         if (use_fence) {
-            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-            ctx->device->device.resetFences({ ctx->fence });
+            ggml_vk_wait_for_fence(ctx);
         }
 #ifdef GGML_VULKAN_CHECK_RESULTS
         ggml_vk_check_results_1(tensor);
@@ -8543,6 +8603,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ctx->gc.events.clear();
 
     ctx->device->device.destroyFence(ctx->fence);
+    ctx->device->device.destroyFence(ctx->almost_ready_fence);
 }
 
 static int ggml_vk_get_device_count() {
@@ -8889,8 +8950,7 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
     }
 
     ggml_vk_submit(transfer_ctx, ctx->fence);
-    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
-    ctx->device->device.resetFences({ ctx->fence });
+    ggml_vk_wait_for_fence(ctx);
 
     for (auto& cpy : transfer_ctx->out_memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
@@ -8909,7 +8969,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 
     uint64_t total_mat_mul_bytes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false);
         if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
             total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
         }
@@ -8951,11 +9011,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
         }
 
+        // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
+        bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
         bool submit = (submitted_nodes >= nodes_per_submit) ||
                       (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
-                      (i == last_node);
+                      (i == last_node) ||
+                      (almost_ready && !ctx->almost_ready_fence_pending);
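+        // The almost_ready clause forces one extra submit here so the almost_ready
+        // fence gets attached to a batch inside ggml_vk_compute_forward.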
 
-        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
 
         if (enqueued) {
             ++submitted_nodes;
@@ -8967,7 +9030,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 #endif
         }
 
-        if (submit) {
+        if (submit && enqueued) {
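+            // Only reset the batch bookkeeping when this submit actually enqueued work.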
             first_node_in_batch = true;
             submitted_nodes = 0;
             mul_mat_bytes = 0;