@@ -936,15 +936,21 @@ static ggml_tensor* ggml_backend_tp_node_compute_split(int device_index, ggml_te
 }
 
 static bool immediate_compute = true;
-static void ggml_backend_tp_buffer_compute_graph(ggml_cgraph * cgraph, std::function<bool(int, std::set<ggml_tensor*>)> gather_pending, std::function<bool(int, ggml_tensor *, ggml_tensor_parallel_extra *)> compute) {
+static void ggml_backend_tp_buffer_compute_graph(ggml_cgraph * cgraph, std::function<bool(int, std::set<ggml_tensor*>)> gather_pending, std::function<bool(int, ggml_tensor *, ggml_tensor_parallel_extra *)> compute, std::function<void(int, std::set<ggml_tensor*>)> flush_compute) {
     std::set<ggml_tensor*> pending_gathers;
     for (int node_index = 0; node_index < cgraph->n_nodes; node_index++) {
         auto tensor = cgraph->nodes[node_index];
         auto extra = (ggml_tensor_parallel_extra *)tensor->extra;
 
         // wait for async memcpy to finish if needed
-        if ((extra->needs_src_rejoin || immediate_compute) && pending_gathers.size()) {
-            if (gather_pending && !gather_pending(node_index, pending_gathers)) {
+        if (extra->needs_src_rejoin && pending_gathers.size()) {
+            if (!immediate_compute) {
+                if (flush_compute) {
+                    flush_compute(node_index, pending_gathers);
+                }
+            }
+
+            if (!gather_pending(node_index, pending_gathers)) {
                 return;
             }
             pending_gathers.clear();
@@ -957,6 +963,12 @@ static void ggml_backend_tp_buffer_compute_graph(ggml_cgraph * cgraph, std::func
         if (extra->has_rejoin) {
             pending_gathers.insert(tensor);
         }
+
+        if (immediate_compute) {
+            if (flush_compute) {
+                flush_compute(node_index, pending_gathers);
+            }
+        }
     }
 }
 
@@ -975,9 +987,13 @@ static void ggml_backend_tp_buffer_graph_compute_one(struct compute_thread * thr
     auto device_index = thread->device_index;
     auto be = ggml_parallel_backends[device_index];
 
+    if (!be->iface.cpy_tensor2d_async) {
+        GGML_ABORT("Backend %s does not support async tensor copy.\n", be->iface.get_name(be));
+    }
+
     int rejoins = 0;
 
-    auto flush_compute = [&](int node_index) {
+    auto flush_compute = [&](int node_index, std::set<ggml_tensor*> pending_gathers) {
         if (backend_graph->n_nodes) {
             auto status = be->iface.graph_compute(be, backend_graph);
             if (status != GGML_STATUS_SUCCESS) {
@@ -986,19 +1002,11 @@ static void ggml_backend_tp_buffer_graph_compute_one(struct compute_thread * thr
             backend_graph->n_nodes = 0;
         }
         thread->end = node_index;
-    };
-
-    auto gather_pending = [&](int node_index, std::set<ggml_tensor*> pending_gathers) {
-        flush_compute(node_index);
 
         for (auto & tensor : pending_gathers) {
             auto extra = (ggml_tensor_parallel_extra *)tensor->extra;
             auto wrapped = extra->tensors[device_index];
 
-            if (!be->iface.cpy_tensor2d_async) {
-                GGML_ABORT("Backend %s does not support async tensor copy.\n", be->iface.get_name(be));
-            }
-
             // async copies
             for (size_t other_device_index = 0; other_device_index < ggml_parallel_devices.size(); other_device_index++) {
                 auto other_be = ggml_parallel_backends[other_device_index];
@@ -1018,7 +1026,9 @@ static void ggml_backend_tp_buffer_graph_compute_one(struct compute_thread * thr
                 }
             }
         }
+    };
 
+    auto gather_pending = [&](int node_index, std::set<ggml_tensor*> pending_gathers) {
         rejoins++;
         // synchronize self and then release peers
         ggml_backend_synchronize(be);
@@ -1043,16 +1053,11 @@ static void ggml_backend_tp_buffer_graph_compute_one(struct compute_thread * thr
         backend_graph->nodes[backend_graph->n_nodes++] = ggml_backend_tp_node_compute_split(device_index, tensor);
         extra->computed[device_index] = true;
 
-        if (immediate_compute) {
-            flush_compute(node_index);
-            ggml_backend_synchronize(be);
-        }
-
         return true;
     };
 
-    ggml_backend_tp_buffer_compute_graph(cgraph, gather_pending, compute);
-    flush_compute(cgraph->n_nodes);
+    ggml_backend_tp_buffer_compute_graph(cgraph, gather_pending, compute, flush_compute);
+    flush_compute(cgraph->n_nodes, std::set<ggml_tensor*>());
 
     thread->done.unlock();
 
@@ -1817,7 +1822,7 @@ static enum ggml_status ggml_backend_tp_graph_compute(ggml_backend_t backend, gg
     ggml_backend_tp_buffer_compute_graph(cgraph, nullptr, [&](int node_index, ggml_tensor * tensor, ggml_tensor_parallel_extra * extra) {
         do_init(tensor, extra);
         return true;
-    });
+    }, nullptr);
 
     // calculate the sizes needed for gathering the tensors.
     // this must happen on main thread to prevent race conditions on gather tensor setup.
@@ -1849,7 +1854,7 @@ static enum ggml_status ggml_backend_tp_graph_compute(ggml_backend_t backend, gg
             gather_buft_sizes_cur[device_index] += extra->gather_buft_sizes[device_index];
         }
         return true;
-    });
+    }, nullptr);
 
     // allocate the gather buffers
     for (size_t device_index = 0; device_index < ggml_parallel_devices.size(); device_index++) {
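Below is a minimal, self-contained sketch of the control flow this change introduces: the graph walk now receives a separate flush_compute callback, flushes the accumulated backend graph only just before a gather in deferred mode, and flushes after every node when immediate_compute is set. Node, compute_graph, and the callback bodies are simplified stand-ins, and the placement of the per-node compute call is inferred (those lines fall outside the shown hunks); only the flush/gather ordering mirrors the diff.

// Sketch only: stand-in types, not the ggml structures used above.
#include <cstdio>
#include <functional>
#include <set>
#include <vector>

struct Node { int id; bool needs_src_rejoin; bool has_rejoin; };

static bool immediate_compute = true;

static void compute_graph(std::vector<Node> & nodes,
                          std::function<bool(int, std::set<Node*>)> gather_pending,
                          std::function<bool(int, Node *)> compute,
                          std::function<void(int, std::set<Node*>)> flush_compute) {
    std::set<Node*> pending_gathers;
    for (int i = 0; i < (int)nodes.size(); i++) {
        Node * node = &nodes[i];

        // A node that consumes gathered sources must wait for outstanding gathers.
        if (node->needs_src_rejoin && pending_gathers.size()) {
            // Deferred mode: flush the backend graph recorded so far only now,
            // right before the gather that depends on it.
            if (!immediate_compute && flush_compute) {
                flush_compute(i, pending_gathers);
            }
            if (!gather_pending(i, pending_gathers)) {
                return;
            }
            pending_gathers.clear();
        }

        // Record the node (placement inferred; not part of the shown hunks).
        if (!compute(i, node)) {
            return;
        }
        if (node->has_rejoin) {
            pending_gathers.insert(node);
        }

        // Immediate mode: flush after every node instead of batching.
        if (immediate_compute && flush_compute) {
            flush_compute(i, pending_gathers);
        }
    }
}

int main() {
    std::vector<Node> nodes = {
        {0, false, true},  // produces a tensor that must be gathered
        {1, true, false},  // consumes the gathered tensor
    };
    compute_graph(nodes,
        [](int i, std::set<Node*> pending) {
            std::printf("gather before node %d (%zu pending)\n", i, pending.size());
            return true;
        },
        [](int i, Node *) {
            std::printf("record node %d\n", i);
            return true;
        },
        [](int i, std::set<Node*>) {
            std::printf("flush up to node %d\n", i);
        });
    return 0;
}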