3333#include <future>
3434#include <thread>
3535
// Portable CPU pause/yield hint for spin-wait loops.
// Selects the cheapest "I am busy-waiting" instruction per toolchain/arch,
// and falls back to a no-op where none is known.
#if defined(_MSC_VER)
   // Guard against a build system that already defines NOMINMAX (avoids a
   // C4005 macro-redefinition warning on MSVC).
#  if !defined(NOMINMAX)
#    define NOMINMAX 1
#  endif
#  include <windows.h>
#  define YIELD() YieldProcessor()
#elif defined(__clang__) || defined(__GNUC__)
#  if defined(__x86_64__) || defined(__i386__)
#    include <immintrin.h>
#    define YIELD() _mm_pause()
#  elif defined(__arm__) || defined(__aarch64__)
#    if defined(__clang__)
#      include <arm_acle.h>
#      define YIELD() __yield()
#    else
       // GCC has no __yield() intrinsic for ARM; emit the instruction directly.
#      define YIELD() asm volatile("yield")
#    endif
#  endif
#endif

// Fallback: expand to nothing on architectures without a known hint.
#if !defined(YIELD)
#define YIELD()
#endif
57+
3658#include "ggml-impl.h"
3759#include "ggml-backend-impl.h"
3860
@@ -796,7 +818,8 @@ struct ggml_backend_vk_context {
796818 ggml_vk_garbage_collector gc;
797819 size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
798820 vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
799- vk::Fence fence;
821+ vk::Fence fence, almost_ready_fence;
822+ bool almost_ready_fence_pending {};
800823
801824 vk_buffer buffer_pool[MAX_VK_BUFFERS];
802825
@@ -887,6 +910,39 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx
887910
888911static void ggml_backend_vk_free(ggml_backend_t backend);
889912
913+ // Wait for ctx->fence to be signaled.
914+ static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
915+ // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
916+ // during this wait.
917+ if (ctx->almost_ready_fence_pending) {
918+ VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence");
919+ ctx->device->device.resetFences({ ctx->almost_ready_fence });
920+ ctx->almost_ready_fence_pending = false;
921+ }
922+
923+ // Spin (w/pause) waiting for the graph to finish executing.
924+ vk::Result result;
925+ while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) {
926+ if (result != vk::Result::eNotReady) {
927+ fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__);
928+ exit(1);
929+ }
930+ for (uint32_t i = 0; i < 100; ++i) {
931+ YIELD();
932+ YIELD();
933+ YIELD();
934+ YIELD();
935+ YIELD();
936+ YIELD();
937+ YIELD();
938+ YIELD();
939+ YIELD();
940+ YIELD();
941+ }
942+ }
943+ ctx->device->device.resetFences({ ctx->fence });
944+ }
945+
890946// variables to track number of compiles in progress
891947static uint32_t compile_count = 0;
892948static std::mutex compile_count_mutex;
@@ -3372,6 +3428,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
33723428 ctx->prealloc_size_split_k = 0;
33733429
33743430 ctx->fence = ctx->device->device.createFence({});
3431+ ctx->almost_ready_fence = ctx->device->device.createFence({});
33753432
33763433#ifdef GGML_VULKAN_CHECK_RESULTS
33773434 const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
@@ -7976,11 +8033,11 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
79768033 }
79778034}
79788035
7979- static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
8036+ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready );
79808037
79818038// Returns true if node has enqueued work into the queue, false otherwise
79828039// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
7983- static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
8040+ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
79848041 if (ggml_is_empty(node) || !node->buffer) {
79858042 return false;
79868043 }
@@ -8352,7 +8409,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
83528409
83538410 ctx->compute_ctx.reset();
83548411
8355- bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
8412+ bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false, almost_ready );
83568413 if (!ok) {
83578414 if (node->op == GGML_OP_UNARY) {
83588415 std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
@@ -8366,7 +8423,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
83668423 return true;
83678424}
83688425
8369- static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true) {
8426+ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) {
83708427 ggml_backend_buffer * buf = nullptr;
83718428
83728429 switch (tensor->op) {
@@ -8469,12 +8526,15 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
84698526 memcpy(cpy.dst, cpy.src, cpy.n);
84708527 }
84718528
8472- ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
8529+ if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
8530+ ggml_vk_submit(subctx, ctx->almost_ready_fence);
8531+ ctx->almost_ready_fence_pending = true;
8532+ } else {
8533+ ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
8534+ }
84738535
84748536 if (use_fence) {
8475- VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
8476-
8477- ctx->device->device.resetFences({ ctx->fence });
8537+ ggml_vk_wait_for_fence(ctx);
84788538 }
84798539#ifdef GGML_VULKAN_CHECK_RESULTS
84808540 ggml_vk_check_results_1(tensor);
@@ -8560,6 +8620,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
85608620 ctx->gc.events.clear();
85618621
85628622 ctx->device->device.destroyFence(ctx->fence);
8623+ ctx->device->device.destroyFence(ctx->almost_ready_fence);
85638624}
85648625
85658626static int ggml_vk_get_device_count() {
@@ -8906,8 +8967,7 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
89068967 }
89078968
89088969 ggml_vk_submit(transfer_ctx, ctx->fence);
8909- VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
8910- ctx->device->device.resetFences({ ctx->fence });
8970+ ggml_vk_wait_for_fence(ctx);
89118971
89128972 for (auto& cpy : transfer_ctx->out_memcpys) {
89138973 memcpy(cpy.dst, cpy.src, cpy.n);
@@ -8926,7 +8986,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
89268986
89278987 uint64_t total_mat_mul_bytes = 0;
89288988 for (int i = 0; i < cgraph->n_nodes; i++) {
8929- ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
8989+ ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false );
89308990 if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
89318991 total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
89328992 }
@@ -8968,11 +9028,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
89689028 mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
89699029 }
89709030
9031+ // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
9032+ bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
89719033 bool submit = (submitted_nodes >= nodes_per_submit) ||
89729034 (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
8973- (i == last_node);
9035+ (i == last_node) ||
9036+ (almost_ready && !ctx->almost_ready_fence_pending);
89749037
8975- bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
9038+ bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
89769039
89779040 if (enqueued) {
89789041 ++submitted_nodes;
@@ -8984,7 +9047,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
89849047#endif
89859048 }
89869049
8987- if (submit) {
9050+ if (submit && enqueued ) {
89889051 first_node_in_batch = true;
89899052 submitted_nodes = 0;
89909053 mul_mat_bytes = 0;
0 commit comments