Skip to content

Commit 2ea90c3

Browse files
wqerrewetwam17 and jeffbolznv
authored
up (#14)
* Ci (#11) (#12) * Fix cl (#7) * Rename build-amd.yml to build-amd.yml.disabled * Rename winget.yml to winget.yml.disabled * Rename server.yml to server.yml.disabled * Rename build.yml to build.yml.disabled * Update release.yml * Rename build-cmake-pkg.yml to build-cmake-pkg.yml.disabled * Rename build-linux-cross.yml to build-linux-cross.yml.disabled * Rename build-riscv-native.yml.disabled to build-riscv-native.yml * Rename docker.yml.disabled to docker.yml * Rename update-ops-docs.yml to update-ops-docs.yml.disabled * Remove macOS-arm64 job from release workflow Removed macOS-arm64 job and its associated steps from the release workflow. * CUDA: Fix bug in topk-moe for gpt-oss (ggml-org#16821) * CUDA: Fix bug in topk-moe for gpt-oss When using ggml_can_fuse_subgraph, the output nodes which are passed are wrong. This causes `test-backend-ops` to still fuse nodes (because the nodes are not used elsewhere in the graph), but it actually doesn't fuse in the actual gpt-oss * fix for qwen3 too * change ifndef to ifdef * vulkan: Call ggml_vk_buffer_write_2d from ggml_vk_buffer_copy (ggml-org#16793) This lets the copy to the destination device use the host-visible vidmem optimization. --------- Co-authored-by: Aman Gupta <[email protected]> Co-authored-by: Jeff Bolz <[email protected]>
1 parent 677540c commit 2ea90c3

File tree

2 files changed

+14
-6
lines changed

2 files changed

+14
-6
lines changed

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2978,7 +2978,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
29782978
ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
29792979

29802980
if (ops.size() == topk_moe_ops_with_norm.size() &&
2981-
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
2981+
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
29822982
ggml_tensor * softmax = cgraph->nodes[node_idx];
29832983
ggml_tensor * weights = cgraph->nodes[node_idx + 9];
29842984

@@ -2997,7 +2997,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
29972997
}
29982998

29992999
if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
3000-
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
3000+
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
30013001
ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
30023002
ggml_tensor * weights = cgraph->nodes[node_idx + 5];
30033003

@@ -3118,9 +3118,20 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
31183118
// With the use of CUDA graphs, the execution will be performed by the graph launch.
31193119
if (!use_cuda_graph || cuda_graph_update_required) {
31203120

3121+
[[maybe_unused]] int prev_i = 0;
3122+
31213123
for (int i = 0; i < cgraph->n_nodes; i++) {
31223124
ggml_tensor * node = cgraph->nodes[i];
31233125

3126+
3127+
#ifdef GGML_CUDA_DEBUG
3128+
const int nodes_fused = i - prev_i - 1;
3129+
prev_i = i;
3130+
if (nodes_fused > 0) {
3131+
GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
3132+
}
3133+
#endif
3134+
31243135
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
31253136
continue;
31263137
}

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5652,14 +5652,11 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
56525652
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
56535653
// Copy device to device
56545654
ggml_vk_ensure_sync_staging_buffer(src->device, size);
5655-
ggml_vk_ensure_sync_staging_buffer(dst->device, size);
56565655

56575656
// Copy to src staging buffer
56585657
ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size);
5659-
// memcpy to dst staging buffer
5660-
memcpy(dst->device->sync_staging->ptr, src->device->sync_staging->ptr, size);
56615658
// Copy to dst buffer
5662-
ggml_vk_buffer_copy(dst, dst_offset, dst->device->sync_staging, 0, size);
5659+
ggml_vk_buffer_write_2d(dst, dst_offset, src->device->sync_staging->ptr, 0, size, 1);
56635660
}
56645661
}
56655662

0 commit comments

Comments (0)