#include <mutex>
#include <future>
#include <thread>
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
#include <immintrin.h>  // _mm_pause for the x86 spin-wait hint
#endif

#include "ggml-impl.h"
#include "ggml-backend-impl.h"
@@ -770,7 +771,8 @@ struct ggml_backend_vk_context {
770771 ggml_vk_garbage_collector gc;
771772 size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
772773 vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
773- vk::Fence fence;
774+ vk::Fence fence, almost_ready_fence;
775+ bool almost_ready_fence_pending {};
774776
775777 vk_buffer buffer_pool[MAX_VK_BUFFERS];
776778
@@ -861,6 +863,39 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx
861863
862864static void ggml_backend_vk_free (ggml_backend_t backend);
863865
866+ // Wait for ctx->fence to be signaled.
867+ static void ggml_vk_wait_for_fence (ggml_backend_vk_context * ctx) {
868+ // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
869+ // during this wait.
870+ if (ctx->almost_ready_fence_pending ) {
871+ VK_CHECK (ctx->device ->device .waitForFences ({ ctx->almost_ready_fence }, true , UINT64_MAX), " almost_ready_fence" );
872+ ctx->device ->device .resetFences ({ ctx->almost_ready_fence });
873+ ctx->almost_ready_fence_pending = false ;
874+ }
875+
876+ // Spin (w/pause) waiting for the graph to finish executing.
877+ vk::Result result;
878+ while ((result = ctx->device ->device .getFenceStatus (ctx->fence )) != vk::Result::eSuccess) {
879+ if (result != vk::Result::eNotReady) {
880+ fprintf (stderr, " ggml_vulkan: error %s at %s:%d\n " , to_string (result).c_str (), __FILE__, __LINE__);
881+ exit (1 );
882+ }
883+ for (uint32_t i = 0 ; i < 100 ; ++i) {
884+ _mm_pause ();
885+ _mm_pause ();
886+ _mm_pause ();
887+ _mm_pause ();
888+ _mm_pause ();
889+ _mm_pause ();
890+ _mm_pause ();
891+ _mm_pause ();
892+ _mm_pause ();
893+ _mm_pause ();
894+ }
895+ }
896+ ctx->device ->device .resetFences ({ ctx->fence });
897+ }
898+
864899// variables to track number of compiles in progress
865900static uint32_t compile_count = 0 ;
866901static std::mutex compile_count_mutex;
@@ -3229,6 +3264,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
32293264 ctx->prealloc_size_split_k = 0 ;
32303265
32313266 ctx->fence = ctx->device ->device .createFence ({});
3267+ ctx->almost_ready_fence = ctx->device ->device .createFence ({});
32323268
32333269#ifdef GGML_VULKAN_CHECK_RESULTS
32343270 const char * skip_checks = getenv (" GGML_VULKAN_SKIP_CHECKS" );
@@ -7522,11 +7558,11 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
75227558 }
75237559}
75247560
7525- static bool ggml_vk_compute_forward (ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
7561+ static bool ggml_vk_compute_forward (ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready );
75267562
75277563// Returns true if node has enqueued work into the queue, false otherwise
75287564// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
7529- static bool ggml_vk_build_graph (ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
7565+ static bool ggml_vk_build_graph (ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
75307566 if (ggml_is_empty (node) || !node->buffer ) {
75317567 return false ;
75327568 }
@@ -7898,7 +7934,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
78987934
78997935 ctx->compute_ctx .reset ();
79007936
7901- bool ok = ggml_vk_compute_forward (ctx, node_begin, node_idx_begin, false );
7937+ bool ok = ggml_vk_compute_forward (ctx, node_begin, node_idx_begin, false , almost_ready );
79027938 if (!ok) {
79037939 if (node->op == GGML_OP_UNARY) {
79047940 std::cerr << __func__ << " : error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name (static_cast <ggml_unary_op>(node->op_params [0 ])) << " )" << std::endl;
@@ -7912,7 +7948,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
79127948 return true ;
79137949}
79147950
7915- static bool ggml_vk_compute_forward (ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true ) {
7951+ static bool ggml_vk_compute_forward (ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true , bool almost_ready = false ) {
79167952 ggml_backend_buffer * buf = nullptr ;
79177953
79187954 switch (tensor->op ) {
@@ -8015,12 +8051,15 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
80158051 memcpy (cpy.dst , cpy.src , cpy.n );
80168052 }
80178053
8018- ggml_vk_submit (subctx, use_fence ? ctx->fence : vk::Fence{});
8054+ if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
8055+ ggml_vk_submit (subctx, ctx->almost_ready_fence );
8056+ ctx->almost_ready_fence_pending = true ;
8057+ } else {
8058+ ggml_vk_submit (subctx, use_fence ? ctx->fence : vk::Fence{});
8059+ }
80198060
80208061 if (use_fence) {
8021- VK_CHECK (ctx->device ->device .waitForFences ({ ctx->fence }, true , UINT64_MAX), " ggml_vk_compute_forward waitForFences" );
8022-
8023- ctx->device ->device .resetFences ({ ctx->fence });
8062+ ggml_vk_wait_for_fence (ctx);
80248063 }
80258064#ifdef GGML_VULKAN_CHECK_RESULTS
80268065 ggml_vk_check_results_1 (tensor);
@@ -8106,6 +8145,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
81068145 ctx->gc .events .clear ();
81078146
81088147 ctx->device ->device .destroyFence (ctx->fence );
8148+ ctx->device ->device .destroyFence (ctx->almost_ready_fence );
81098149}
81108150
81118151static int ggml_vk_get_device_count () {
@@ -8452,8 +8492,7 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
84528492 }
84538493
84548494 ggml_vk_submit (transfer_ctx, ctx->fence );
8455- VK_CHECK (ctx->device ->device .waitForFences ({ ctx->fence }, true , UINT64_MAX), " ggml_backend_vk_synchronize waitForFences" );
8456- ctx->device ->device .resetFences ({ ctx->fence });
8495+ ggml_vk_wait_for_fence (ctx);
84578496
84588497 for (auto & cpy : transfer_ctx->out_memcpys ) {
84598498 memcpy (cpy.dst , cpy.src , cpy.n );
@@ -8472,7 +8511,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
84728511
84738512 uint64_t total_mat_mul_bytes = 0 ;
84748513 for (int i = 0 ; i < cgraph->n_nodes ; i++) {
8475- ggml_vk_build_graph (ctx, cgraph->nodes [i], i, nullptr , 0 , true , false , false );
8514+ ggml_vk_build_graph (ctx, cgraph->nodes [i], i, nullptr , 0 , true , false , false , false );
84768515 if (cgraph->nodes [i]->op == GGML_OP_MUL_MAT || cgraph->nodes [i]->op == GGML_OP_MUL_MAT_ID) {
84778516 total_mat_mul_bytes += ggml_nbytes (cgraph->nodes [i]->src [0 ]);
84788517 }
@@ -8514,11 +8553,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
85148553 mul_mat_bytes += ggml_nbytes (cgraph->nodes [i]->src [0 ]);
85158554 }
85168555
8556+ // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
8557+ bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5 ;
85178558 bool submit = (submitted_nodes >= nodes_per_submit) ||
85188559 (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
8519- (i == last_node);
8560+ (i == last_node) ||
8561+ (almost_ready && !ctx->almost_ready_fence_pending );
85208562
8521- bool enqueued = ggml_vk_build_graph (ctx, cgraph->nodes [i], i, cgraph->nodes [submit_node_idx], submit_node_idx, false , i == last_node, submit);
8563+ bool enqueued = ggml_vk_build_graph (ctx, cgraph->nodes [i], i, cgraph->nodes [submit_node_idx], submit_node_idx, false , i == last_node, almost_ready, submit);
85228564
85238565 if (enqueued) {
85248566 ++submitted_nodes;
@@ -8530,7 +8572,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
85308572#endif
85318573 }
85328574
8533- if (submit) {
8575+ if (submit && enqueued ) {
85348576 first_node_in_batch = true ;
85358577 submitted_nodes = 0 ;
85368578 mul_mat_bytes = 0 ;
0 commit comments