93 changes: 78 additions & 15 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -24,6 +24,28 @@
#include <future>
#include <thread>

#if defined(_MSC_VER)
# define NOMINMAX 1
# include <windows.h>
# define YIELD() YieldProcessor()
#elif defined(__clang__) || defined(__GNUC__)
# if defined(__x86_64__) || defined(__i386__)
# include <immintrin.h>
# define YIELD() _mm_pause()
# elif defined(__arm__) || defined(__aarch64__)
# if defined(__clang__)
# include <arm_acle.h>
# define YIELD() __yield()
# else
# define YIELD() asm volatile("yield")
# endif
# endif
#endif

#if !defined(YIELD)
#define YIELD()
#endif

#include "ggml-impl.h"
#include "ggml-backend-impl.h"

@@ -770,7 +792,8 @@ struct ggml_backend_vk_context {
ggml_vk_garbage_collector gc;
size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
vk::Fence fence;
vk::Fence fence, almost_ready_fence;
bool almost_ready_fence_pending {};

vk_buffer buffer_pool[MAX_VK_BUFFERS];

@@ -861,6 +884,39 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx

static void ggml_backend_vk_free(ggml_backend_t backend);

// Wait for ctx->fence to be signaled.
static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
// Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
// during this wait.
if (ctx->almost_ready_fence_pending) {
VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence");
ctx->device->device.resetFences({ ctx->almost_ready_fence });
ctx->almost_ready_fence_pending = false;
}

// Spin (w/pause) waiting for the graph to finish executing.
vk::Result result;
while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) {
if (result != vk::Result::eNotReady) {
fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__);
exit(1);
}
for (uint32_t i = 0; i < 100; ++i) {
YIELD();
YIELD();
YIELD();
YIELD();
YIELD();
YIELD();
YIELD();
YIELD();
YIELD();
YIELD();
}
}
ctx->device->device.resetFences({ ctx->fence });
}

// variables to track number of compiles in progress
static uint32_t compile_count = 0;
static std::mutex compile_count_mutex;
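The new wait path pairs two fences: almost_ready_fence is attached to a submission late in the graph, so ggml_vk_wait_for_fence can sleep in waitForFences for most of the execution time and then busy-poll getFenceStatus (with the YIELD() hint) only for the short remainder, avoiding the wake-up latency of a second blocking wait. A stripped-down sketch of the round trip, assuming a caller-owned vk::Device, vk::Queue and two pre-recorded command buffers; the real code splits the graph into many submissions and tracks the pending state in the backend context, and the names here are illustrative only:

#include <cstdint>
#include <vulkan/vulkan.hpp>

static void submit_and_wait(vk::Device device, vk::Queue queue,
                            vk::CommandBuffer bulk_cmds,   // roughly the first 80% of the graph
                            vk::CommandBuffer tail_cmds,   // the last few nodes
                            vk::Fence almost_ready_fence, vk::Fence fence) {
    vk::SubmitInfo bulk_info;
    bulk_info.commandBufferCount = 1;
    bulk_info.pCommandBuffers    = &bulk_cmds;
    vk::SubmitInfo tail_info;
    tail_info.commandBufferCount = 1;
    tail_info.pCommandBuffers    = &tail_cmds;

    queue.submit(bulk_info, almost_ready_fence);  // early-warning fence
    queue.submit(tail_info, fence);               // final fence

    // Long wait: let the driver put the CPU thread to sleep.
    (void) device.waitForFences(almost_ready_fence, true, UINT64_MAX);
    device.resetFences(almost_ready_fence);

    // Short wait: spin with a CPU pause so completion of the last nodes is
    // observed without paying for another kernel wake-up.
    while (device.getFenceStatus(fence) == vk::Result::eNotReady) {
        YIELD();
    }
    device.resetFences(fence);
}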
@@ -3229,6 +3285,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
ctx->prealloc_size_split_k = 0;

ctx->fence = ctx->device->device.createFence({});
ctx->almost_ready_fence = ctx->device->device.createFence({});

#ifdef GGML_VULKAN_CHECK_RESULTS
const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
@@ -7522,11 +7579,11 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
}
}

static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready);

// Returns true if node has enqueued work into the queue, false otherwise
// If submit is true, all operations queued so far are submitted to Vulkan to overlap cmdlist creation and GPU execution.
static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
if (ggml_is_empty(node) || !node->buffer) {
return false;
}
@@ -7898,7 +7955,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

ctx->compute_ctx.reset();

bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false, almost_ready);
if (!ok) {
if (node->op == GGML_OP_UNARY) {
std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
@@ -7912,7 +7969,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
return true;
}

static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) {
ggml_backend_buffer * buf = nullptr;

switch (tensor->op) {
@@ -8015,12 +8072,15 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(cpy.dst, cpy.src, cpy.n);
}

ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
ggml_vk_submit(subctx, ctx->almost_ready_fence);
ctx->almost_ready_fence_pending = true;
} else {
ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
}

if (use_fence) {
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");

ctx->device->device.resetFences({ ctx->fence });
ggml_vk_wait_for_fence(ctx);
}
#ifdef GGML_VULKAN_CHECK_RESULTS
ggml_vk_check_results_1(tensor);
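The branch above picks which fence, if any, to attach to the submission: the almost_ready fence is used only when the caller will not wait on this submission right away (use_fence is false) and no almost_ready signal is already in flight. Restated as a small hypothetical helper, not part of the PR:

// Hypothetical restatement of the fence-selection logic above.
static vk::Fence choose_fence(bool use_fence, bool almost_ready, bool almost_ready_pending,
                              vk::Fence fence, vk::Fence almost_ready_fence) {
    if (almost_ready && !almost_ready_pending && !use_fence) {
        return almost_ready_fence;           // early-warning signal for the hybrid wait
    }
    return use_fence ? fence : vk::Fence{};  // final fence, or no fence at all
}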
@@ -8106,6 +8166,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
ctx->gc.events.clear();

ctx->device->device.destroyFence(ctx->fence);
ctx->device->device.destroyFence(ctx->almost_ready_fence);
}

static int ggml_vk_get_device_count() {
Expand Down Expand Up @@ -8452,8 +8513,7 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
}

ggml_vk_submit(transfer_ctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
ctx->device->device.resetFences({ ctx->fence });
ggml_vk_wait_for_fence(ctx);

for (auto& cpy : transfer_ctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
Expand All @@ -8472,7 +8532,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg

uint64_t total_mat_mul_bytes = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false);
if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
}
@@ -8514,11 +8574,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
}

// Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
bool submit = (submitted_nodes >= nodes_per_submit) ||
(mul_mat_bytes >= mul_mat_bytes_per_submit) ||
(i == last_node);
(i == last_node) ||
(almost_ready && !ctx->almost_ready_fence_pending);

bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);

if (enqueued) {
++submitted_nodes;
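To make the "< 20% remaining" threshold above concrete, the condition can be read as a small predicate; the worked values below are illustrative and follow from C++ integer division:

// Mirrors the almost_ready condition in ggml_backend_vk_graph_compute.
static bool is_almost_ready(int n_nodes, int i) {
    return (n_nodes - i) < n_nodes / 5;
}
// is_almost_ready(100, 80) == false  (20 nodes remain, not < 100 / 5 == 20)
// is_almost_ready(100, 81) == true   (19 nodes remain)
// is_almost_ready(4, 3)    == false  (4 / 5 == 0, so very small graphs never
//                                     trigger it and use the normal fence path)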
Expand All @@ -8530,7 +8593,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
#endif
}

if (submit) {
if (submit && enqueued) {
first_node_in_batch = true;
submitted_nodes = 0;
mul_mat_bytes = 0;