 #include <future>
 #include <thread>
 
+#if defined(_MSC_VER)
+#  define NOMINMAX 1
+#  include <windows.h>
+#  define YIELD() YieldProcessor()
+#elif defined(__clang__) || defined(__GNUC__)
+#  if defined(__x86_64__) || defined(__i386__)
+#    include <immintrin.h>
+#    define YIELD() _mm_pause()
+#  elif defined(__arm__) || defined(__aarch64__)
+#    if defined(__clang__)
+#      include <arm_acle.h>
+#      define YIELD() __yield()
+#    else
+#      define YIELD() asm volatile("yield")
+#    endif
+#  endif
+#endif
+
+#if !defined(YIELD)
+#define YIELD()
+#endif
+
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
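For context (not part of the patch): `_mm_pause()`, `__yield()`, and `YieldProcessor()` all emit a CPU "pause" hint, which keeps a spin-wait loop from monopolizing pipeline resources and reduces power draw while polling. A minimal standalone sketch of the same pattern, assuming an x86-64 target (the `spin_until` helper is hypothetical, not part of ggml):

#include <atomic>
#include <immintrin.h>

// Busy-wait on a flag, issuing a pause hint between polls. On other targets
// the hint would be __yield() (ARM) or YieldProcessor() (MSVC) instead.
static void spin_until(const std::atomic<bool> & flag) {
    while (!flag.load(std::memory_order_acquire)) {
        _mm_pause();
    }
}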
@@ -770,7 +792,8 @@ struct ggml_backend_vk_context {
     ggml_vk_garbage_collector gc;
     size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
     vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
-    vk::Fence fence;
+    vk::Fence fence, almost_ready_fence;
+    bool almost_ready_fence_pending {};
 
     vk_buffer buffer_pool[MAX_VK_BUFFERS];
 
@@ -861,6 +884,39 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx
 
 static void ggml_backend_vk_free(ggml_backend_t backend);
 
+// Wait for ctx->fence to be signaled.
+static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
+    // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
+    // during this wait.
+    if (ctx->almost_ready_fence_pending) {
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence");
+        ctx->device->device.resetFences({ ctx->almost_ready_fence });
+        ctx->almost_ready_fence_pending = false;
+    }
+
+    // Spin (w/pause) waiting for the graph to finish executing.
+    vk::Result result;
+    while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) {
+        if (result != vk::Result::eNotReady) {
+            fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__);
+            exit(1);
+        }
+        for (uint32_t i = 0; i < 100; ++i) {
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+        }
+    }
+    ctx->device->device.resetFences({ ctx->fence });
+}
+
 // variables to track number of compiles in progress
 static uint32_t compile_count = 0;
 static std::mutex compile_count_mutex;
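The function above is a two-phase wait: first a blocking waitForFences on almost_ready_fence, during which the driver can put the CPU thread to sleep, then a short busy-poll of getFenceStatus on the main fence so the final wake-up does not pay kernel-sleep latency. A hypothetical, Vulkan-free sketch of the same idea (the type and member names here are illustrative, not from the source):

#include <atomic>
#include <condition_variable>
#include <mutex>

// Phase 1 sleeps until an "almost ready" signal; phase 2 spins on the final
// completion flag so the last few microseconds are not spent waking a thread.
struct two_phase_event {
    std::mutex m;
    std::condition_variable cv;
    bool almost_ready = false;
    std::atomic<bool> done{false};

    void wait() {
        {
            // phase 1: cheap blocking wait while most of the work runs
            std::unique_lock<std::mutex> lock(m);
            cv.wait(lock, [&] { return almost_ready; });
        }
        // phase 2: spin on the completion flag for minimal wake-up latency
        while (!done.load(std::memory_order_acquire)) {
            // a pause hint (YIELD() above) belongs here
        }
    }
};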
@@ -3229,6 +3285,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     ctx->prealloc_size_split_k = 0;
 
     ctx->fence = ctx->device->device.createFence({});
+    ctx->almost_ready_fence = ctx->device->device.createFence({});
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
     const char * skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
@@ -7522,11 +7579,11 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready);
 
 // Returns true if node has enqueued work into the queue, false otherwise
 // If submit is true, all operations queued so far are submitted to Vulkan to overlap cmdlist creation and GPU execution.
-static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
     if (ggml_is_empty(node) || !node->buffer) {
         return false;
     }
@@ -7898,7 +7955,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
     ctx->compute_ctx.reset();
 
-    bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+    bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false, almost_ready);
     if (!ok) {
         if (node->op == GGML_OP_UNARY) {
             std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
@@ -7912,7 +7969,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true) {
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) {
     ggml_backend_buffer * buf = nullptr;
 
     switch (tensor->op) {
@@ -8015,12 +8072,15 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
         memcpy(cpy.dst, cpy.src, cpy.n);
     }
 
-    ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+    if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
+        ggml_vk_submit(subctx, ctx->almost_ready_fence);
+        ctx->almost_ready_fence_pending = true;
+    } else {
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+    }
 
     if (use_fence) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-        ctx->device->device.resetFences({ ctx->fence });
+        ggml_vk_wait_for_fence(ctx);
     }
 #ifdef GGML_VULKAN_CHECK_RESULTS
     ggml_vk_check_results_1(tensor);
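To summarize the branch above: the almost_ready fence is only ever attached to an intermediate, unfenced submission, never to the final one the caller waits on. A hypothetical helper (not in the patch) expressing the same selection:

// Illustrative rewrite of the fence selection in ggml_vk_compute_forward.
static vk::Fence ggml_vk_select_fence(ggml_backend_vk_context * ctx, bool use_fence, bool almost_ready) {
    if (use_fence) {
        return ctx->fence;                       // final submission, caller waits on it
    }
    if (almost_ready && !ctx->almost_ready_fence_pending) {
        ctx->almost_ready_fence_pending = true;  // a wait/reset is now owed
        return ctx->almost_ready_fence;          // early wake-up point for the waiter
    }
    return vk::Fence{};                          // plain intermediate submission, no fence
}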
@@ -8106,6 +8166,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ctx->gc.events.clear();
 
     ctx->device->device.destroyFence(ctx->fence);
+    ctx->device->device.destroyFence(ctx->almost_ready_fence);
 }
 
 static int ggml_vk_get_device_count() {
@@ -8452,8 +8513,7 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
     }
 
     ggml_vk_submit(transfer_ctx, ctx->fence);
-    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
-    ctx->device->device.resetFences({ ctx->fence });
+    ggml_vk_wait_for_fence(ctx);
 
     for (auto & cpy : transfer_ctx->out_memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
@@ -8472,7 +8532,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 
     uint64_t total_mat_mul_bytes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false);
         if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
             total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
         }
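With the new almost_ready flag, ggml_vk_build_graph now takes nine positional arguments, which is easy to misread. For reference, the dry-run call above maps to the following (the inline comments are illustrative, not part of the patch):

ggml_vk_build_graph(ctx, cgraph->nodes[i], i,
                    /*node_begin*/ nullptr, /*node_idx_begin*/ 0,
                    /*dryrun*/ true, /*last_node*/ false,
                    /*almost_ready*/ false, /*submit*/ false);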
@@ -8514,11 +8574,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
         }
 
+        // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
+        bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
         bool submit = (submitted_nodes >= nodes_per_submit) ||
                       (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
-                      (i == last_node);
+                      (i == last_node) ||
+                      (almost_ready && !ctx->almost_ready_fence_pending);
 
-        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
 
         if (enqueued) {
             ++submitted_nodes;
@@ -8530,7 +8593,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 #endif
         }
 
-        if (submit) {
+        if (submit && enqueued) {
             first_node_in_batch = true;
             submitted_nodes = 0;
             mul_mat_bytes = 0;
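A worked example of the almost_ready threshold above: with cgraph->n_nodes == 100, n_nodes / 5 == 20, so the condition first becomes true at i == 81 (19 nodes remaining), covering roughly the last 20% of the graph. A tiny standalone check (illustrative, not part of the patch):

#include <cassert>

int main() {
    const int n_nodes = 100;
    int first = -1; // first node index where the "almost ready" window opens
    for (int i = 0; i < n_nodes; i++) {
        if ((n_nodes - i) < n_nodes / 5) { first = i; break; }
    }
    assert(first == 81); // the last 19 nodes, just under 20% of the graph
    // Note the integer division: with fewer than 15 nodes the condition is
    // never true before the last node, which is harmless because the final
    // submission always signals ctx->fence anyway.
}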