Skip to content

Commit 053b786 — Merge branch 'ggml-org:master' into mradermacher
2 parents: bde7b75 + e562eec

File tree

4 files changed

+17
-9
lines changed

4 files changed

+17
-9
lines changed

examples/parallel/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Simplified simulation of serving incoming requests in parallel
44

55
## Example
66

7-
Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of 10 junk questions (`-j 10`) followed by the actual question.
7+
Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of up to 10 junk questions (`--junk 10`) followed by the actual question.
88

99
```bash
1010
llama-parallel -m model.gguf -np 8 -ns 128 --top-k 1 -pps --junk 10 -c 16384

examples/parallel/parallel.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,10 @@ int main(int argc, char ** argv) {
315315
} else {
316316
client.prompt += k_system;
317317
}
318-
for (int i = 0; i < n_junk; ++i) {
318+
319+
const int n_junk_cur = rand() % n_junk;
320+
321+
for (int i = 0; i < n_junk_cur; ++i) {
319322
const int r = rand() % k_questions.size();
320323
client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n";
321324
}
@@ -340,7 +343,7 @@ int main(int argc, char ** argv) {
340343
client.n_decoded = 0;
341344
client.i_batch = batch.n_tokens - 1;
342345

343-
LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
346+
LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur);
344347

345348
g_seq_id += 1;
346349

ggml/src/ggml-backend.cpp

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1340,7 +1340,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
13401340
// allocate graph
13411341
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
13421342
// the re-allocation may cause the split inputs to be moved to a different address
1343-
ggml_backend_sched_synchronize(sched);
1343+
// synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
1344+
for (int i = 0; i < sched->n_backends; i++) {
1345+
ggml_backend_synchronize(sched->backends[i]);
1346+
}
13441347
#ifndef NDEBUG
13451348
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
13461349
#endif
@@ -1564,7 +1567,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra
15641567

15651568
ggml_backend_sched_split_graph(sched, graph);
15661569

1567-
15681570
if (!ggml_backend_sched_alloc_splits(sched)) {
15691571
return false;
15701572
}
@@ -1598,9 +1600,12 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
15981600
for (int i = 0; i < sched->n_backends; i++) {
15991601
ggml_backend_synchronize(sched->backends[i]);
16001602
}
1601-
// reset the current copy to 0 so that the graphs will be similar during generation
1602-
// necessary for CUDA graphs
1603-
sched->cur_copy = 0;
1603+
if (!sched->is_alloc) {
1604+
// if the graph is not already allocated, always use copy 0 after a synchronization
1605+
// this ensures that during generation the same copy is used every time,
1606+
// which avoids changes in the graph that could cause CUDA or other graphs to be disabled
1607+
sched->cur_copy = 0;
1608+
}
16041609
}
16051610

16061611
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1246,7 +1246,7 @@ static __global__ void flash_attn_ext_f16(
12461246
NO_DEVICE_CODE;
12471247
return;
12481248
}
1249-
#endif __CUDA_ARCH__ == GGML_CUDA_CC_TURING
1249+
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING
12501250

12511251
static_assert(!mla || DKQ >= DV, "MLA needs DKQ >= DV");
12521252

Comments (0)