@@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
 
 static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
     if (ctx->seqs.empty()) {
+        if (fence) {
+            ctx->q->queue.submit({}, fence);
+        }
         return;
     }
     VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
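Before this change, an empty context dropped the fence entirely, so a caller that passed a fence and later waited on it could hang. A minimal sketch of the same guard in isolation, assuming plain Vulkan-Hpp handles (`queue`, `batches`, and `fence` are placeholder names, not the backend's members):

```cpp
#include <vulkan/vulkan.hpp>
#include <vector>

// Sketch: even with nothing to submit, pass the fence to an empty submit so it
// still gets signaled once previously submitted queue work completes.
void submit_or_signal(vk::Queue queue, const std::vector<vk::SubmitInfo>& batches, vk::Fence fence) {
    if (batches.empty()) {
        if (fence) {
            queue.submit({}, fence);  // fence-only submit, valid per the Vulkan spec
        }
        return;
    }
    queue.submit(batches, fence);
}
```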
@@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+
+// Returns true if node has enqueued work into the queue, false otherwise
+// If submit is true, all operations queued so far are submitted to Vulkan to overlap cmdlist creation and GPU execution.
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
 
     if (ggml_is_empty(node) || extra == nullptr) {
-        return;
+        return false;
     }
 
     VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
@@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        return;
+        return false;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         case GGML_UNARY_OP_TANH:
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_REPEAT:
@@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     default:
         std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
         GGML_ABORT("fatal error");
-        return;
+        return false;
     }
 
     vk_context compute_ctx;
@@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
             ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_DIAG_MASK_INF:
@@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     default:
-        return;
+        return false;
     }
 
     if (dryrun) {
-        return;
+        return false;
     }
 
     ctx->tensor_ctxs[node_idx] = compute_ctx;
@@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (last_node) {
+    if (submit || last_node) {
         ggml_vk_ctx_end(compute_ctx);
-        compute_ctx->exit_tensor_idx = node_idx;
+
+        // TODO probably it'd be better to pass an exit_node flag to ggml_vk_compute_forward
+        if (last_node) {
+            compute_ctx->exit_tensor_idx = node_idx_begin;
+        }
+        else {
+            compute_ctx->exit_tensor_idx = -1;
+        }
+
         ctx->compute_ctx.reset();
+
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        if (!ok) {
+            if (node->op == GGML_OP_UNARY) {
+                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+            }
+            else {
+                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+            }
+        }
+
     }
+    return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
     ggml_tensor_extra_gpu * extra = nullptr;
 
     switch (tensor->op) {
@@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
 
     VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
 
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    ggml_vk_check_results_0(tensor);
-#endif
-
     vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
 
-#ifdef GGML_VULKAN_PERF
-    std::chrono::steady_clock::time_point start;
-#endif // GGML_VULKAN_PERF
+    // always wait for the GPU work to be done for the last submit
+    if (tensor_idx == subctx->exit_tensor_idx) {
+        use_fence = true;
+    }
 
     // Only run if ctx hasn't been submitted yet
     if (!subctx->seqs.empty()) {
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_0(tensor);
+        use_fence = true;
+#endif
+
         // Do staging buffer copies
         for (auto& cpy : subctx->in_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
 
-#ifdef GGML_VULKAN_PERF
-        start = std::chrono::steady_clock::now();
-#endif // GGML_VULKAN_PERF
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+
+        if (use_fence) {
+            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
 
-        ggml_vk_submit(subctx, ctx->fence);
+            ctx->device->device.resetFences({ ctx->fence });
+        }
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_1(tensor);
+#endif
     }
 
     if (tensor_idx == subctx->exit_tensor_idx) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-#ifdef GGML_VULKAN_PERF
-        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
-        ctx->device->perf_logger->log_timing(tensor, duration.count());
-#endif // GGML_VULKAN_PERF
-
-        ctx->device->device.resetFences({ ctx->fence });
-
         // Do staging buffer copies
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
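The reworked block above attaches `ctx->fence` only when `use_fence` is set, so intermediate batches can be flushed to the GPU without stalling the CPU. A rough sketch of that conditional-fence pattern on its own, assuming standalone Vulkan-Hpp handles (all names here are placeholders, not the backend's members):

```cpp
#include <vulkan/vulkan.hpp>
#include <cstdint>
#include <vector>

// Sketch: submit a recorded batch; only block and reset the fence when the
// caller asks for completion (e.g. for the last batch of a graph).
void submit_batch(vk::Device device, vk::Queue queue,
                  const std::vector<vk::SubmitInfo>& batch,
                  vk::Fence fence, bool wait_for_completion) {
    queue.submit(batch, wait_for_completion ? fence : vk::Fence{});

    if (wait_for_completion) {
        // Block until the GPU drains everything submitted so far, then make
        // the fence reusable for the next submission.
        (void) device.waitForFences({ fence }, VK_TRUE, UINT64_MAX);
        device.resetFences({ fence });
    }
}
```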
@@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
     }
     ggml_vk_preallocate_buffers(ctx);
     ggml_pipeline_allocate_descriptor_sets(ctx->device);
@@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     // Reserve tensor context space for all nodes
     ctx->tensor_ctxs.resize(cgraph->n_nodes);
 
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
-    }
+    bool first_node_in_batch = true; // true if next node will be first node in a batch
+    int submit_node_idx = 0; // index to first node in a batch
 
+    // submit work every submit_count nodes to overlap CPU cmdbuffer generation with GPU execution
+    constexpr int submit_count = 100;
+    int submitted_nodes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (ggml_vk_is_empty(node)) {
-            continue;
+        if (first_node_in_batch) {
+            submit_node_idx = i;
         }
 
-        bool ok = ggml_vk_compute_forward(ctx, node, i);
-        if (!ok) {
-            if (node->op == GGML_OP_UNARY) {
-                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
-            } else {
-                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
+
+
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+
+        if (enqueued) {
+            ++submitted_nodes;
+
+#ifndef GGML_VULKAN_CHECK_RESULTS
+            if (first_node_in_batch) {
+                first_node_in_batch = false;
             }
+#endif
         }
-#ifdef GGML_VULKAN_CHECK_RESULTS
-        else {
-            ggml_vk_check_results_1(node);
+
+        if (submit) {
+            first_node_in_batch = true;
+            submitted_nodes = 0;
         }
-#endif
-        GGML_ASSERT(ok);
     }
 
 #ifdef GGML_VULKAN_PERF
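For reference, the batched-submission loop introduced in this hunk reduces to the following shape once the ggml specifics are stripped out; `Node`, `record_node`, and `flush` are hypothetical stand-ins, and the CHECK_RESULTS special case is omitted:

```cpp
#include <vector>

struct Node { /* placeholder graph node */ };
bool record_node(const Node& node);          // hypothetical: returns false for no-op nodes
void flush(int first, int last, bool wait);  // hypothetical: submit nodes [first, last]

// Sketch of the batched-submit loop: record nodes into the current command
// context and flush every `batch_size` recorded nodes (or at the last node),
// waiting on the GPU only for the final batch.
void run_graph(const std::vector<Node>& nodes, int batch_size = 100) {
    int batch_begin = 0;  // first node of the current batch
    int recorded    = 0;  // nodes recorded since the last flush

    for (int i = 0; i < (int) nodes.size(); i++) {
        const bool last   = (i + 1 == (int) nodes.size());
        const bool submit = (recorded >= batch_size) || last;

        if (record_node(nodes[i])) {
            ++recorded;
        }
        if (submit) {
            flush(batch_begin, i, /*wait=*/last);
            batch_begin = i + 1;
            recorded    = 0;
        }
    }
}
```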