Skip to content

Commit 053b786 — Merge branch 'ggml-org:master' into mradermacher
2 parents: bde7b75 + e562eec

File tree

4 files changed

+17
-9
lines changed

4 files changed

+17
-9
lines changed

examples/parallel/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Simplified simulation of serving incoming requests in parallel
44

55
## Example
66

7-
Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of 10 junk questions (`-j 10`) followed by the actual question.
7+
Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of up to 10 junk questions (`--junk 10`) followed by the actual question.
88

99
```bash
1010
llama-parallel -m model.gguf -np 8 -ns 128 --top-k 1 -pps --junk 10 -c 16384

examples/parallel/parallel.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,10 @@ int main(int argc, char ** argv) {
315315
} else {
316316
client.prompt += k_system;
317317
}
318-
for (int i = 0; i < n_junk; ++i) {
318+
319+
const int n_junk_cur = rand() % n_junk;
320+
321+
for (int i = 0; i < n_junk_cur; ++i) {
319322
const int r = rand() % k_questions.size();
320323
client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n";
321324
}
@@ -340,7 +343,7 @@ int main(int argc, char ** argv) {
340343
client.n_decoded = 0;
341344
client.i_batch = batch.n_tokens - 1;
342345

343-
LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
346+
LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur);
344347

345348
g_seq_id += 1;
346349

ggml/src/ggml-backend.cpp

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1340,7 +1340,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
13401340
// allocate graph
13411341
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
13421342
// the re-allocation may cause the split inputs to be moved to a different address
1343-
ggml_backend_sched_synchronize(sched);
1343+
// synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
1344+
for (int i = 0; i < sched->n_backends; i++) {
1345+
ggml_backend_synchronize(sched->backends[i]);
1346+
}
13441347
#ifndef NDEBUG
13451348
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
13461349
#endif
@@ -1564,7 +1567,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra
15641567

15651568
ggml_backend_sched_split_graph(sched, graph);
15661569

1567-
15681570
if (!ggml_backend_sched_alloc_splits(sched)) {
15691571
return false;
15701572
}
@@ -1598,9 +1600,12 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
15981600
for (int i = 0; i < sched->n_backends; i++) {
15991601
ggml_backend_synchronize(sched->backends[i]);
16001602
}
1601-
// reset the current copy to 0 so that the graphs will be similar during generation
1602-
// necessary for CUDA graphs
1603-
sched->cur_copy = 0;
1603+
if (!sched->is_alloc) {
1604+
// if the graph is not already allocated, always use copy 0 after a synchronization
1605+
// this ensures that during generation the same copy is used every time,
1606+
// which avoids changes in the graph that could cause CUDA or other graphs to be disabled
1607+
sched->cur_copy = 0;
1608+
}
16041609
}
16051610

16061611
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1246,7 +1246,7 @@ static __global__ void flash_attn_ext_f16(
12461246
NO_DEVICE_CODE;
12471247
return;
12481248
}
1249-
#endif __CUDA_ARCH__ == GGML_CUDA_CC_TURING
1249+
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING
12501250

12511251
static_assert(!mla || DKQ >= DV, "MLA needs DKQ >= DV");
12521252

Comments (0)