Skip to content

Commit 2d34163

Browse files
committed
vulkan: get the first command buffer submitted sooner
This is an incremental improvement over ggml-org#9118 to get work to the GPU a bit sooner. The first part is to start with a smaller number of nodes before the first submit, and ramp it up to the current 100 nodes/submit. The second part is to reduce the dryrun overhead for all the nodes that just need to request descriptor space. With these changes I get around 1-2% speedup on RTX 4070 combined with my old Haswell-era CPU.
1 parent 9abe9ee commit 2d34163

File tree

1 file changed

+56
-4
lines changed

1 file changed

+56
-4
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 56 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5630,6 +5630,48 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
56305630
} else {
56315631
compute_ctx = ctx->compute_ctx.lock();
56325632
}
5633+
} else {
5634+
switch (node->op) {
5635+
case GGML_OP_REPEAT:
5636+
case GGML_OP_ACC:
5637+
case GGML_OP_GET_ROWS:
5638+
case GGML_OP_ADD:
5639+
case GGML_OP_MUL:
5640+
case GGML_OP_DIV:
5641+
case GGML_OP_CONCAT:
5642+
case GGML_OP_UPSCALE:
5643+
case GGML_OP_SCALE:
5644+
case GGML_OP_SQR:
5645+
case GGML_OP_SIN:
5646+
case GGML_OP_COS:
5647+
case GGML_OP_CLAMP:
5648+
case GGML_OP_PAD:
5649+
case GGML_OP_CPY:
5650+
case GGML_OP_CONT:
5651+
case GGML_OP_DUP:
5652+
case GGML_OP_NORM:
5653+
case GGML_OP_GROUP_NORM:
5654+
case GGML_OP_RMS_NORM:
5655+
case GGML_OP_UNARY:
5656+
case GGML_OP_DIAG_MASK_INF:
5657+
case GGML_OP_SOFT_MAX:
5658+
case GGML_OP_ROPE:
5659+
case GGML_OP_ARGSORT:
5660+
case GGML_OP_SUM_ROWS:
5661+
case GGML_OP_IM2COL:
5662+
case GGML_OP_TIMESTEP_EMBEDDING:
5663+
case GGML_OP_POOL_2D:
5664+
case GGML_OP_LEAKY_RELU:
5665+
{
5666+
// These operations all go through ggml_vk_op_f32, so short-circuit and
5667+
// do the only thing needed for the dryrun.
5668+
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
5669+
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
5670+
return false;
5671+
}
5672+
default:
5673+
break;
5674+
}
56335675
}
56345676

56355677
switch (node->op) {
@@ -6359,16 +6401,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
63596401
bool first_node_in_batch = true; // true if next node will be first node in a batch
63606402
int submit_node_idx = 0; // index to first node in a batch
63616403

6362-
// submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
6363-
constexpr int submit_count = 100;
6404+
// Submit work every nodes_per_submit nodes to overlap CPU cmdbuffer generation with GPU execution.
6405+
// Start with a smaller count to get work submitted right away, and increase it after each submit.
6406+
int nodes_per_submit = 20;
63646407
int submitted_nodes = 0;
6408+
int submit_count = 0;
63656409
for (int i = 0; i < cgraph->n_nodes; i++) {
63666410
if (first_node_in_batch) {
63676411
submit_node_idx = i;
63686412
}
63696413

6370-
bool submit = (submitted_nodes >= submit_count) || (i == last_node);
6371-
6414+
bool submit = (submitted_nodes >= nodes_per_submit) || (i == last_node);
63726415

63736416
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
63746417

@@ -6385,6 +6428,15 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
63856428
if (submit) {
63866429
first_node_in_batch = true;
63876430
submitted_nodes = 0;
6431+
switch (submit_count) {
6432+
case 0:
6433+
nodes_per_submit = 50;
6434+
break;
6435+
default:
6436+
nodes_per_submit = 100;
6437+
break;
6438+
}
6439+
submit_count++;
63886440
}
63896441
}
63906442

0 commit comments

Comments (0)