
Commit 003f1c5 (parent: 24c1ea9)

Revert "Merge commit '1ee9d0b415cdf5240418c110a18b419f4002b154' into concedo_experimental"

This reverts commit 2d22e61, reversing changes made to 2cee3b2.

File tree

14 files changed: +499 −751 lines

ggml/src/ggml-cuda/common.cuh

Lines changed: 7 additions & 0 deletions
@@ -949,6 +949,13 @@ struct ggml_cuda_graph {
     bool disable_due_to_failed_graph_capture = false;
     int number_consecutive_updates = 0;
     std::vector<ggml_graph_node_properties> ggml_graph_properties;
+    bool use_cpy_indirection = false;
+    std::vector<char *> cpy_dest_ptrs;
+    char ** dest_ptrs_d;
+    int dest_ptrs_size = 0;
+    // Index to allow each cpy kernel to be aware of its position within the graph
+    // relative to other cpy nodes.
+    int graph_cpynode_index = -1;
 #endif
 };
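These restored fields implement copy-op indirection for CUDA graphs: cpy_dest_ptrs stages the current token's destination addresses on the host, dest_ptrs_d mirrors them on the device (with dest_ptrs_size tracking its capacity), and graph_cpynode_index hands each copy kernel its slot in that table. A minimal sketch of the kernel-side idea, with a hypothetical kernel name (the real kernels live in cpy.cu):

// Hedged sketch, not the actual cpy.cu kernel: read the destination through
// a device-side pointer table instead of a baked-in address, so a captured
// CUDA graph can be retargeted without re-capturing.
#include <cuda_runtime.h>

static __global__ void cpy_indirect_sketch(const char * src, char ** dest_ptrs,
                                           const int node_index, const int nbytes) {
    // On each replay, dest_ptrs[node_index] may name a different buffer;
    // only the table contents change, never the captured kernel arguments.
    char * dst = dest_ptrs[node_index];
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nbytes) {
        dst[i] = src[i];
    }
}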

ggml/src/ggml-cuda/cpy.cu

Lines changed: 163 additions & 55 deletions
Large diffs are not rendered by default.
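The cpy.cu diff is not rendered here, but the header changes below show its two new entry points. One plausible reading of ggml_cuda_cpy_fn, given how ggml-cuda.cu uses it: map a (src, dst) type pair to the address of the matching copy kernel, so the caller can reject unsupported pairs (nullptr) and use the kernel address as a stable identity for the op. A hedged sketch with hypothetical names, not the real dispatch:

#include "ggml.h"
#include <cuda_runtime.h>

// Illustrative only; the real dispatch in cpy.cu covers many type pairs.
static __global__ void cpy_f32_f32_sketch(const char * src, char * dst) {
    // body elided; a real kernel copies one element per thread
}

static void * cpy_fn_sketch(const ggml_tensor * src0, const ggml_tensor * src1) {
    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        return (void *) cpy_f32_f32_sketch; // kernel address identifies the op
    }
    return nullptr; // unsupported pair: caller disables CUDA graphs
}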

ggml/src/ggml-cuda/cpy.cuh

Lines changed: 5 additions & 1 deletion
@@ -2,6 +2,10 @@
 
 #define CUDA_CPY_BLOCK_SIZE 64
 
-void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
+void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false);
 
 void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
+
+void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream);
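ggml_cuda_cpy_dest_ptrs_copy is declared above but defined in the unrendered cpy.cu diff. Any such upload routine has to grow the device table on demand and refresh it asynchronously each token; a hedged sketch using the field names from common.cuh (not the actual implementation):

// Hedged sketch: upload this token's destination pointers to the device.
void cpy_dest_ptrs_copy_sketch(ggml_cuda_graph * cuda_graph,
        char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
    if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) {
        // grow the device-side table; synchronize first so no in-flight
        // graph replay is still reading the old allocation
        CUDA_CHECK(cudaStreamSynchronize(stream));
        if (cuda_graph->dest_ptrs_d != nullptr) {
            CUDA_CHECK(cudaFree(cuda_graph->dest_ptrs_d));
        }
        CUDA_CHECK(cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size * sizeof(char *)));
        cuda_graph->dest_ptrs_size = host_dest_ptrs_size;
    }
    // refresh the table contents for this token
    CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs,
                               host_dest_ptrs_size * sizeof(char *),
                               cudaMemcpyHostToDevice, stream));
    cuda_graph->graph_cpynode_index = 0; // restart the per-graph cpy-node cursor
}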

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 31 additions & 2 deletions
@@ -2655,10 +2655,11 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 }
 
 #ifdef USE_CUDA_GRAPH
-static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
+static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
     bool use_cuda_graph) {
 
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
 
     const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
     const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
@@ -2709,11 +2710,33 @@
 #endif
         }
 
+        if (node->op == GGML_OP_CPY) {
+
+            // Store the pointers which are updated for each token, such that these can be sent
+            // to the device and accessed using indirection from CUDA graph
+            cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
+
+            // store a pointer to each copy op CUDA kernel to identify it later
+            void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+            if (!ptr) {
+                use_cuda_graph = false;
+#ifndef NDEBUG
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+#endif
+            }
+        }
+
         if (!use_cuda_graph) {
             break;
         }
     }
 
+    if (use_cuda_graph) {
+        cuda_ctx->cuda_graph->use_cpy_indirection = true;
+        // copy pointers to GPU so they can be accessed via indirection within CUDA graph
+        ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
+    }
+
     return use_cuda_graph;
 }
@@ -2732,6 +2755,7 @@ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
 
 static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
     if (node->data != graph_node_properties->node_address &&
+        node->op != GGML_OP_CPY &&
         node->op != GGML_OP_VIEW) {
         return false;
     }
@@ -2752,6 +2776,7 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         if (node->src[i] &&
             node->src[i]->data != graph_node_properties->src_address[i] &&
+            node->op != GGML_OP_CPY &&
             node->op != GGML_OP_VIEW
         ) {
             return false;
@@ -3117,7 +3142,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     if (use_cuda_graph) {
         cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
 
-        use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph);
+        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
 
         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
         if (use_cuda_graph && cuda_graph_update_required) {
@@ -3144,6 +3169,10 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }
 
+    if (!use_cuda_graph) {
+        cuda_ctx->cuda_graph->use_cpy_indirection = false;
+    }
+
 #else
     bool use_cuda_graph = false;
     bool cuda_graph_update_required = false;
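Taken together, the flow above is: collect the destination pointers per token, upload them once, and let the captured graph read through them. The same pattern, stripped of ggml types, fits in a short self-contained demo; everything below is a hedged illustration, not llama.cpp code:

#include <cstdio>
#include <cuda_runtime.h>

// Copy kernel that resolves its destination through a device pointer table.
__global__ void copy_via_table(const float * src, float ** table, int idx, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        table[idx][i] = src[i];
    }
}

int main() {
    const int n = 256;
    float *src, *dst_a, *dst_b, **table;   // error checks omitted for brevity
    cudaMalloc(&src,   n * sizeof(float));
    cudaMalloc(&dst_a, n * sizeof(float));
    cudaMalloc(&dst_b, n * sizeof(float));
    cudaMalloc(&table, sizeof(float *));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // "token 1": point the table at dst_a, then capture the copy into a graph
    cudaMemcpyAsync(table, &dst_a, sizeof(float *), cudaMemcpyHostToDevice, stream);
    cudaGraph_t graph;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed);
    copy_via_table<<<(n + 63) / 64, 64, 0, stream>>>(src, table, 0, n);
    cudaStreamEndCapture(stream, &graph);

    cudaGraphExec_t exec;
    cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0); // CUDA 12+: (&exec, graph, 0)
    cudaGraphLaunch(exec, stream);   // writes dst_a

    // "token 2": retarget by rewriting only the table; same executable graph
    cudaMemcpyAsync(table, &dst_b, sizeof(float *), cudaMemcpyHostToDevice, stream);
    cudaGraphLaunch(exec, stream);   // writes dst_b, no re-capture needed
    cudaStreamSynchronize(stream);

    printf("replayed one captured graph into two destinations\n");
    return 0;
}

This is also why the property-matching hunks exempt GGML_OP_CPY from the address checks: with indirection enabled, a changed copy destination no longer forces a CUDA graph update.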

ggml/src/ggml-cuda/mmf.cu

Lines changed: 6 additions & 40 deletions
@@ -1,7 +1,5 @@
 #include "ggml.h"
 #include "mmf.cuh"
-#include "mmid.cuh"
-
 
 void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
     GGML_ASSERT( src1->type == GGML_TYPE_F32);
@@ -39,12 +37,6 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
     const int64_t ids_s0 = ids ? ids->nb[0] / ggml_type_size(ids->type) : 0;
     const int64_t ids_s1 = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0;
 
-    mmf_ids_data ids_info{};
-    mmf_ids_data * ids_info_ptr = nullptr;
-    ggml_cuda_pool_alloc<int32_t> ids_src_compact_dev;
-    ggml_cuda_pool_alloc<int32_t> ids_dst_compact_dev;
-    ggml_cuda_pool_alloc<int32_t> expert_bounds_dev;
-
     // For MUL_MAT_ID the memory layout is different than for MUL_MAT:
     const int64_t ncols_dst = ids ? ne2 : ne1;
     const int64_t nchannels_dst = ids ? ne1 : ne2;
@@ -62,57 +54,30 @@
         nchannels_y = ids->ne[0];
     }
 
-    if (ids && ncols_dst > 16) {
-        const int64_t n_expert_used = ids->ne[0];
-        const int64_t n_experts = ne02;
-        const int64_t n_tokens = ne12;
-        const int64_t ne_get_rows = n_tokens * n_expert_used;
-
-        ids_src_compact_dev.alloc(ctx.pool(), ne_get_rows);
-        ids_dst_compact_dev.alloc(ctx.pool(), ne_get_rows);
-        expert_bounds_dev.alloc(ctx.pool(), n_experts + 1);
-
-        const int si1 = static_cast<int>(ids_s1);
-        const int sis1 = static_cast<int>(src1->nb[2] / src1->nb[1]);
-
-        GGML_ASSERT(sis1 > 0);
-
-        ggml_cuda_launch_mm_ids_helper(ids_d, ids_src_compact_dev.get(), ids_dst_compact_dev.get(), expert_bounds_dev.get(),
-            static_cast<int>(n_experts), static_cast<int>(n_tokens), static_cast<int>(n_expert_used), static_cast<int>(ne11), si1, sis1, ctx.stream());
-        CUDA_CHECK(cudaGetLastError());
-
-        ids_info.ids_src_compact = ids_src_compact_dev.get();
-        ids_info.ids_dst_compact = ids_dst_compact_dev.get();
-        ids_info.expert_bounds_dev = expert_bounds_dev.get();
-        ids_info.n_experts = static_cast<int>(n_experts);
-        ids_info.sis1 = sis1;
-        ids_info_ptr = &ids_info;
-    }
-
     switch (src0->type) {
         case GGML_TYPE_F32: {
             const float * src0_d = (const float *) src0->data;
             constexpr int vals_per_T = 1;
             mul_mat_f_switch_cols_per_block(
                 src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
                 ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
-                ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream(), ids_info_ptr);
+                ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
         } break;
         case GGML_TYPE_F16: {
             const half2 * src0_d = (const half2 *) src0->data;
             constexpr int vals_per_T = 2;
             mul_mat_f_switch_cols_per_block(
                 src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
                 ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
-                ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream(), ids_info_ptr);
+                ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
         } break;
         case GGML_TYPE_BF16: {
             const nv_bfloat162 * src0_d = (const nv_bfloat162 *) src0->data;
             constexpr int vals_per_T = 2;
             mul_mat_f_switch_cols_per_block(
                 src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
                 ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
-                ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream(), ids_info_ptr);
+                ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
         } break;
         default:
             GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
@@ -133,9 +98,10 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
     }
 
     if (mul_mat_id) {
-        if (src0_ne[1] <= 1024 && src1_ncols > 512) {
+        if (type == GGML_TYPE_F32 && src1_ncols > 32) {
             return false;
-        } else if(src0_ne[1] > 1024 && src1_ncols > 128) {
+        }
+        if ((type == GGML_TYPE_F16 || type == GGML_TYPE_BF16) && src1_ncols > 64) {
             return false;
         }
     } else {
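The behavioral change in ggml_cuda_should_use_mmf is easiest to see with the two gates side by side; a hedged paraphrase as standalone predicates (illustrative only, not the repo's code):

#include "ggml.h"

// Gate removed by this revert: thresholds keyed on the height of src0.
static bool mmf_gate_removed(int64_t src0_ne1, int64_t src1_ncols) {
    if (src0_ne1 <= 1024 && src1_ncols > 512) return false;
    if (src0_ne1 >  1024 && src1_ncols > 128) return false;
    return true;
}

// Gate restored by this revert: lower thresholds, keyed on element type.
static bool mmf_gate_restored(ggml_type type, int64_t src1_ncols) {
    if (type == GGML_TYPE_F32 && src1_ncols > 32) return false;
    if ((type == GGML_TYPE_F16 || type == GGML_TYPE_BF16) && src1_ncols > 64) return false;
    return true;
}

For example, an F16 MUL_MAT_ID with 128 destination columns and src0 height ≤ 1024 passed the removed gate but fails the restored one, so it now falls back to other mul-mat kernels.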
