Skip to content

Commit c065fc5

Browse files
committed
vulkan: Submit once enough matmul work has been recorded
I've been seeing significantly worse text-generation (tg) performance with flash attention (FA) enabled than with it disabled, and it seems to be related to the submit heuristic. Change the heuristic to track how many bytes of weight matrix the recorded matmul nodes read, and submit the command buffer once 100MB has accumulated (while still submitting at least every 100 nodes). This seems to resolve the issue, and also increases perf for non-FA a bit.
1 parent f4c3dd5 commit c065fc5

File tree

1 file changed

+14
-12
lines changed

1 file changed

+14
-12
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8267,17 +8267,26 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
82678267
bool first_node_in_batch = true; // true if next node will be first node in a batch
82688268
int submit_node_idx = 0; // index to first node in a batch
82698269

8270-
// Submit work every nodes_per_submit nodes to overlap CPU cmdbuffer generation with GPU execution.
8271-
// Start with a smaller count to get work submitted right away, and increase it after each submit.
8272-
int nodes_per_submit = 20;
8270+
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
8271+
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB.
8272+
// Also submit at least every 100 nodes, in case there are workloads without as much matmul.
8273+
int nodes_per_submit = 100;
82738274
int submitted_nodes = 0;
82748275
int submit_count = 0;
8276+
uint64_t mul_mat_bytes = 0;
8277+
uint64_t mul_mat_bytes_per_submit = 100*1000*1000;
82758278
for (int i = 0; i < cgraph->n_nodes; i++) {
82768279
if (first_node_in_batch) {
82778280
submit_node_idx = i;
82788281
}
82798282

8280-
bool submit = (submitted_nodes >= nodes_per_submit) || (i == last_node);
8283+
if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
8284+
mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
8285+
}
8286+
8287+
bool submit = (submitted_nodes >= nodes_per_submit) ||
8288+
(mul_mat_bytes >= mul_mat_bytes_per_submit) ||
8289+
(i == last_node);
82818290

82828291
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
82838292

@@ -8294,14 +8303,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
82948303
if (submit) {
82958304
first_node_in_batch = true;
82968305
submitted_nodes = 0;
8297-
switch (submit_count) {
8298-
case 0:
8299-
nodes_per_submit = 50;
8300-
break;
8301-
default:
8302-
nodes_per_submit = 100;
8303-
break;
8304-
}
8306+
mul_mat_bytes = 0;
83058307
submit_count++;
83068308
}
83078309
}

0 commit comments

Comments
 (0)