@@ -5672,6 +5672,48 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
56725672        } else  {
56735673            compute_ctx = ctx->compute_ctx .lock ();
56745674        }
5675+     } else  {
5676+         switch  (node->op ) {
5677+         case  GGML_OP_REPEAT:
5678+         case  GGML_OP_ACC:
5679+         case  GGML_OP_GET_ROWS:
5680+         case  GGML_OP_ADD:
5681+         case  GGML_OP_MUL:
5682+         case  GGML_OP_DIV:
5683+         case  GGML_OP_CONCAT:
5684+         case  GGML_OP_UPSCALE:
5685+         case  GGML_OP_SCALE:
5686+         case  GGML_OP_SQR:
5687+         case  GGML_OP_SIN:
5688+         case  GGML_OP_COS:
5689+         case  GGML_OP_CLAMP:
5690+         case  GGML_OP_PAD:
5691+         case  GGML_OP_CPY:
5692+         case  GGML_OP_CONT:
5693+         case  GGML_OP_DUP:
5694+         case  GGML_OP_NORM:
5695+         case  GGML_OP_GROUP_NORM:
5696+         case  GGML_OP_RMS_NORM:
5697+         case  GGML_OP_UNARY:
5698+         case  GGML_OP_DIAG_MASK_INF:
5699+         case  GGML_OP_SOFT_MAX:
5700+         case  GGML_OP_ROPE:
5701+         case  GGML_OP_ARGSORT:
5702+         case  GGML_OP_SUM_ROWS:
5703+         case  GGML_OP_IM2COL:
5704+         case  GGML_OP_TIMESTEP_EMBEDDING:
5705+         case  GGML_OP_POOL_2D:
5706+         case  GGML_OP_LEAKY_RELU:
5707+             {
5708+                 //  These operations all go through ggml_vk_op_f32, so short-circuit and
5709+                 //  do the only thing needed for the dryrun.
5710+                 vk_pipeline pipeline = ggml_vk_op_get_pipeline (ctx, src0, src1, src2, node, node->op );
5711+                 ggml_pipeline_request_descriptor_sets (ctx->device , pipeline, 1 );
5712+                 return  false ;
5713+             }
5714+         default :
5715+             break ;
5716+         }
56755717    }
56765718
56775719    switch  (node->op ) {
@@ -6401,16 +6443,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
64016443    bool  first_node_in_batch = true ; //  true if next node will be first node in a batch
64026444    int  submit_node_idx = 0 ; //  index to first node in a batch
64036445
6404-     //  submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
6405-     constexpr  int  submit_count = 100 ;
6446+     //  Submit work every nodes_per_submit nodes to overlap CPU cmdbuffer generation with GPU execution.
6447+     //  Start with a smaller count to get work submitted right away, and increase it after each submit.
6448+     int  nodes_per_submit = 20 ;
64066449    int  submitted_nodes = 0 ;
6450+     int  submit_count = 0 ;
64076451    for  (int  i = 0 ; i < cgraph->n_nodes ; i++) {
64086452        if  (first_node_in_batch) {
64096453            submit_node_idx = i;
64106454        }
64116455
6412-         bool  submit = (submitted_nodes >= submit_count) || (i == last_node);
6413- 
6456+         bool  submit = (submitted_nodes >= nodes_per_submit) || (i == last_node);
64146457
64156458        bool  enqueued = ggml_vk_build_graph (ctx, cgraph->nodes [i], i, cgraph->nodes [submit_node_idx], submit_node_idx, false , i == last_node, submit);
64166459
@@ -6427,6 +6470,15 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
64276470        if  (submit) {
64286471            first_node_in_batch = true ;
64296472            submitted_nodes = 0 ;
6473+             switch  (submit_count) {
6474+             case  0 :
6475+                 nodes_per_submit = 50 ;
6476+                 break ;
6477+             default :
6478+                 nodes_per_submit = 100 ;
6479+                 break ;
6480+             }
6481+             submit_count++;
64306482        }
64316483    }
64326484
0 commit comments