@@ -594,6 +594,7 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
 
 llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     model            (params.model),
+    arch             (model.arch),
     hparams          (model.hparams),
     cparams          (params.cparams),
     ubatch           (params.ubatch),
@@ -633,13 +634,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res              (std::make_unique<llm_graph_result>()) {
 }
 
-// TODO: deduplicate with llama_context::graph_max_nodes()
-int32_t llm_graph_context::graph_max_nodes() const {
-    return std::max<int32_t>(8192, 5*model.n_tensors());
-}
-
 int64_t llm_graph_context::n_pos_per_token() const {
-    return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1;
+    return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }
 
 // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
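
Background on `n_pos_per_token()`: Qwen2-VL's M-RoPE encodes several position components per token (temporal, height, and width sections on top of the base sequence position), hence 4 for that arch and 1 for everything else. As a minimal sketch of how the graph code consumes this value when sizing the positions tensor (names as in the surrounding file, `n_tokens` assumed in scope):

    // one int32 entry per token, times the per-token position components
    ggml_tensor * pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());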
@@ -1251,8 +1247,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 
     // TODO: replace hardcoded padding with ggml-provided padding
     if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) {
-        GGML_UNUSED(model);
-
         GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");
 
         if (v_trans) {
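
On the `n_kv % 256 == 0` guard above: the flash-attention branch is only taken when the KV length is already padded to a hardcoded 256-element granularity, which is exactly what the TODO wants ggml to provide instead. A hedged sketch of the corresponding padding computation on the cache side, using ggml's `GGML_PAD` helper:

    // round the KV length up to the flash-attention granularity;
    // GGML_PAD(x, n) expands to ((x + n - 1) & ~(n - 1)) for power-of-two n
    const uint32_t n_kv_padded = GGML_PAD(n_kv, 256);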
@@ -1272,7 +1266,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         // while for some models F16 is enough, for others it is not, so we default to F32 here
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
 
-        if (model.arch == LLM_ARCH_GROK) {
+        if (arch == LLM_ARCH_GROK) {
             // need to do the following:
             // multiply by attn_output_multiplyer of 0.08838834764831845
             // and then :
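
For the Grok branch: 0.08838834764831845 is 1/sqrt(128), i.e. the usual 1/sqrt(head_dim) attention scale for Grok's 128-wide heads, and the tanh step soft-clamps the logits to [-30, 30]. The statements following these comments in the file (outside this hunk) amount to the following sketch:

    // kq = 30 * tanh(kq * scale / 30), with scale = 1/sqrt(128)
    kq = ggml_tanh (ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
    kq = ggml_scale(ctx0, kq, 30.0f);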
@@ -1483,7 +1477,7 @@ ggml_tensor * llm_graph_context::build_attn(
     // TODO: improve
     bool is_sliding = false;
 
-    switch (model.arch) {
+    switch (arch) {
         case LLM_ARCH_COHERE2:
             {
                 const int32_t sliding_window_pattern = 4;
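
The COHERE2 case that this hunk truncates implements interleaved attention: with a pattern of 4, three consecutive layers use the sliding window and every fourth layer attends globally. A sketch of the case body, assuming the layer index `il` used throughout the file:

    case LLM_ARCH_COHERE2:
        {
            const int32_t sliding_window_pattern = 4;
            // layers 0..2 of each group of 4 slide; layer 3 is global
            is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
        } break;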
@@ -2110,140 +2104,9 @@ void llm_graph_context::build_kv_self_shift(ggml_cgraph * gf) const {
 }
 
 void llm_graph_context::build_kv_self_defrag(ggml_cgraph * gf) const {
-    const llama_kv_cache_unified * kv_self_const = static_cast<const llama_kv_cache_unified *>(memory);
-
-    // TODO: avoid this
-    llama_kv_cache_unified * kv_self = const_cast<llama_kv_cache_unified *>(kv_self_const);
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    const uint32_t n_kv   = kv_self->cell_max();
-    const uint32_t n_used = kv_self->used;
-
-    assert(n_used <= n_kv);
-
-    //const int64_t t_start = ggml_time_us();
-
-    // number of cells moved
-    uint32_t n_moves = 0;
-
-    // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
-    //   - source view, destination view, copy operation
-    //   - x2 for keys and values
-    //const uint32_t max_moves = max_nodes()/(6*n_layer);
-    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
-    const uint32_t max_moves = (graph_max_nodes() - 2*n_layer)/(6*n_layer);
-
-    // determine which KV cells to move where
-    //
-    //  cell i moves to ids[i]
-    //
-    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
-    //
-    std::vector<uint32_t> ids(n_kv, n_kv);
-
-    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
-        const auto & cell0 = kv_self->cells[i0];
-
-        if (!cell0.is_empty()) {
-            ids[i0] = i0;
-
-            continue;
-        }
-
-        // found a hole - fill it with data from the end of the cache
-
-        uint32_t nh = 1;
-
-        // determine the size of the hole
-        while (i0 + nh < n_used && kv_self->cells[i0 + nh].is_empty()) {
-            nh++;
-        }
-
-        uint32_t nf = 0;
-        uint32_t is = n_kv - 1;
-
-        // starting from the end, find nh non-empty cells
-        for (; is > i0; --is) {
-            const auto & cell1 = kv_self->cells[is];
-
-            if (cell1.is_empty() || ids[is] != n_kv) {
-                continue;
-            }
-
-            // non-empty cell which is not yet moved
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        // this can only happen if `n_used` is not accurate, which would be a bug
-        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
-
-        nf = 0;
-
-        uint32_t i1 = is;
-
-        // are we moving a continuous block of memory?
-        bool cont = false;
-
-        // should we stop searching for the next move?
-        bool stop = false;
-
-        // go back and move the nf cells to the hole
-        for (; i1 < n_kv; ++i1) {
-            auto & cell1 = kv_self->cells[i1];
-
-            if (cell1.is_empty() || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
-                cont = false;
-                continue;
-            }
-
-            // this cell goes to (i0 + nf)
-            ids[i1] = i0 + nf;
-
-            // move the cell meta data
-            kv_self->cells[i0 + nf] = cell1;
-
-            // clear the old cell and move the head there
-            cell1 = llama_kv_cell();
-            kv_self->head = n_used;
-
-            if (!cont) {
-                n_moves++;
-                cont = true;
-            }
-
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
-        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
-
-        i0 += nh - 1;
-    }
-
-    if (n_moves == 0) {
-        return;
-    }
-
-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes:  %u\n", 6*n_moves*n_layer);
+    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+
+    const auto & ids = kv_self->defrag_info.ids;
 
 #if 0
     // CPU defrag
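
The removed planner computed `ids` such that cell `i` moves to `ids[i]`, filling holes from the back of the cache, and capped the number of moves because each move costs 6*n_layer graph tensors: with the old 8192-node floor and, say, n_layer = 32, max_moves = (8192 - 2*32)/(6*32) = 42. Now that the cache supplies the plan as `defrag_info.ids`, the same hole-filling idea can be summarized in a condensed standalone sketch (ignoring the `max_moves` cap and the contiguous-run bookkeeping of the original):

    #include <cstdint>
    #include <vector>

    // cell i moves to ids[i]; ids[i] == i or ids[i] == n_kv means "no move".
    // empty[i] marks free cells; n_used is the number of occupied cells.
    static std::vector<uint32_t> defrag_plan(const std::vector<bool> & empty, uint32_t n_used) {
        const uint32_t n_kv = (uint32_t) empty.size();

        std::vector<uint32_t> ids(n_kv, n_kv);

        uint32_t is = n_kv; // scans candidate source cells from the back

        for (uint32_t i0 = 0; i0 < n_used; ++i0) {
            if (!empty[i0]) {
                ids[i0] = i0; // already in place
                continue;
            }

            // hole at i0 - pull in the last occupied, not-yet-moved cell
            while (--is > i0 && (empty[is] || ids[is] != n_kv)) { }

            if (is <= i0) {
                break; // nothing left to move forward
            }

            ids[is] = i0;
        }

        return ids;
    }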
@@ -2424,8 +2287,8 @@ void llm_graph_context::build_pooling(ggml_cgraph * gf) const {
 
                 // classification head
                 // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                GGML_ASSERT(model.cls != nullptr);
-                GGML_ASSERT(model.cls_b != nullptr);
+                GGML_ASSERT(model.cls   != nullptr);
+                GGML_ASSERT(model.cls_b != nullptr);
 
                 cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b);
                 cur = ggml_tanh(ctx0, cur);