Skip to content

Commit 2ea90c3

Browse files
wqerrewetwam17 and jeffbolznv
authored
up (#14)
* Ci (#11) (#12) * Fix cl (#7) * Rename build-amd.yml to build-amd.yml.disabled * Rename winget.yml to winget.yml.disabled * Rename server.yml to server.yml.disabled * Rename build.yml to build.yml.disabled * Update release.yml * Rename build-cmake-pkg.yml to build-cmake-pkg.yml.disabled * Rename build-linux-cross.yml to build-linux-cross.yml.disabled * Rename build-riscv-native.yml.disabled to build-riscv-native.yml * Rename docker.yml.disabled to docker.yml * Rename update-ops-docs.yml to update-ops-docs.yml.disabled * Remove macOS-arm64 job from release workflow Removed macOS-arm64 job and its associated steps from the release workflow. * CUDA: Fix bug in topk-moe for gpt-oss (ggml-org#16821) * CUDA: Fix bug in topk-moe for gpt-oss When using ggml_can_fuse_subgraph, the output nodes which are passed are wrong. This causes `test-backend-ops` to still fuse nodes (because the nodes are not used elsewhere in the graph), but it actually doesn't fuse in the actual gpt-oss * fix for qwen3 too * change ifndef to ifdef * vulkan: Call ggml_vk_buffer_write_2d from ggml_vk_buffer_copy (ggml-org#16793) This lets the copy to the destination device use the host-visible vidmem optimization. --------- Co-authored-by: Aman Gupta <[email protected]> Co-authored-by: Jeff Bolz <[email protected]>
1 parent 677540c commit 2ea90c3

File tree

2 files changed

+14
-6
lines changed

2 files changed

+14
-6
lines changed

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2978,7 +2978,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
29782978
ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
29792979

29802980
if (ops.size() == topk_moe_ops_with_norm.size() &&
2981-
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
2981+
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
29822982
ggml_tensor * softmax = cgraph->nodes[node_idx];
29832983
ggml_tensor * weights = cgraph->nodes[node_idx + 9];
29842984

@@ -2997,7 +2997,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
29972997
}
29982998

29992999
if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
3000-
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
3000+
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
30013001
ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
30023002
ggml_tensor * weights = cgraph->nodes[node_idx + 5];
30033003

@@ -3118,9 +3118,20 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
31183118
// With the use of CUDA graphs, the execution will be performed by the graph launch.
31193119
if (!use_cuda_graph || cuda_graph_update_required) {
31203120

3121+
[[maybe_unused]] int prev_i = 0;
3122+
31213123
for (int i = 0; i < cgraph->n_nodes; i++) {
31223124
ggml_tensor * node = cgraph->nodes[i];
31233125

3126+
3127+
#ifdef GGML_CUDA_DEBUG
3128+
const int nodes_fused = i - prev_i - 1;
3129+
prev_i = i;
3130+
if (nodes_fused > 0) {
3131+
GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
3132+
}
3133+
#endif
3134+
31243135
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
31253136
continue;
31263137
}

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5652,14 +5652,11 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
56525652
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
56535653
// Copy device to device
56545654
ggml_vk_ensure_sync_staging_buffer(src->device, size);
5655-
ggml_vk_ensure_sync_staging_buffer(dst->device, size);
56565655

56575656
// Copy to src staging buffer
56585657
ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size);
5659-
// memcpy to dst staging buffer
5660-
memcpy(dst->device->sync_staging->ptr, src->device->sync_staging->ptr, size);
56615658
// Copy to dst buffer
5662-
ggml_vk_buffer_copy(dst, dst_offset, dst->device->sync_staging, 0, size);
5659+
ggml_vk_buffer_write_2d(dst, dst_offset, src->device->sync_staging->ptr, 0, size, 1);
56635660
}
56645661
}
56655662

0 commit comments

Comments (0)