@@ -4416,6 +4416,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
     return it->second;
 }
 
+ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
+    // choose long/short freq factors based on the context size
+    if (layers[il].rope_freqs != nullptr) {
+        return layers[il].rope_freqs;
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+        return layers[il].rope_long;
+    }
+
+    return layers[il].rope_short;
+}
+
 struct llm_build_llama : public llm_graph_context {
     llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
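
The hunk above moves the long/short YaRN frequency-factor selection out of the KV-cache callback struct and onto llama_model itself, so each graph builder can call model.get_rope_factors(n_ctx_per_seq, il) directly. Below is a minimal, self-contained sketch of that selection rule; the toy_* types are illustrative stand-ins, not llama.cpp structs.

#include <cstdint>
#include <cstdio>

struct toy_tensor { const char * name; };

struct toy_layer {
    toy_tensor * rope_freqs = nullptr; // optional per-layer override
    toy_tensor * rope_long  = nullptr; // long-context frequency factors
    toy_tensor * rope_short = nullptr; // short-context frequency factors
};

struct toy_hparams { uint32_t n_ctx_orig_yarn = 4096; };

// same three-way choice as llama_model::get_rope_factors: an explicit override wins,
// otherwise pick long or short factors based on the per-sequence context size
static toy_tensor * get_rope_factors(const toy_layer & layer, const toy_hparams & hp, uint32_t n_ctx_per_seq) {
    if (layer.rope_freqs != nullptr) {
        return layer.rope_freqs;
    }
    return n_ctx_per_seq > hp.n_ctx_orig_yarn ? layer.rope_long : layer.rope_short;
}

int main() {
    toy_tensor t_long  = { "rope_long"  };
    toy_tensor t_short = { "rope_short" };

    toy_layer layer;
    layer.rope_long  = &t_long;
    layer.rope_short = &t_short;

    toy_hparams hp;

    std::printf("n_ctx=2048  -> %s\n", get_rope_factors(layer, hp,  2048)->name); // rope_short
    std::printf("n_ctx=32768 -> %s\n", get_rope_factors(layer, hp, 32768)->name); // rope_long
    return 0;
}
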
@@ -4456,7 +4469,7 @@ struct llm_build_llama : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4681,7 +4694,7 @@ struct llm_build_deci : public llm_graph_context {
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -7141,7 +7154,7 @@ struct llm_build_phi3 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 ggml_tensor* attn_norm_output = build_norm(inpL,
                         model.layers[il].attn_norm,
@@ -7893,7 +7906,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
-            ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+            ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
             // norm
             cur = build_norm(inpL,
@@ -8961,7 +8974,7 @@ struct llm_build_cohere2 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9899,7 +9912,7 @@ struct llm_build_deepseek : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11264,7 +11277,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12645,7 +12658,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12768,28 +12781,6 @@ struct llm_build_bailingmoe : public llm_graph_context {
 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
 
-    const bool offload = cparams.offload_kqv;
-
-    auto get_buft = [this, offload](int il) {
-        const char * dev_name = "CPU";
-
-        ggml_backend_buffer_type_t buft;
-        if (offload) {
-            auto * dev = dev_layer(il);
-            buft = ggml_backend_dev_buffer_type(dev);
-
-            dev_name = ggml_backend_dev_name(dev);
-        } else {
-            buft = ggml_backend_cpu_buffer_type();
-        }
-
-        LLAMA_LOG_DEBUG("layer %3d: dev = %s\n", il, dev_name);
-
-        return buft;
-    };
-
-    LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
-
     switch (arch) {
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_RWKV6:
@@ -12798,13 +12789,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_ARWKV7:
             {
                 res = new llama_kv_cache_recurrent(
-                        hparams,
-                        {
-                            /*.get_rope_factors =*/ nullptr,
-                            /*.get_buft =*/ get_buft,
-                        },
+                        *this,
                         GGML_TYPE_F32,
                         GGML_TYPE_F32,
+                        cparams.offload_kqv,
                         std::max((uint32_t) 1, cparams.n_seq_max));
             } break;
         default:
@@ -12816,25 +12804,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                 LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
 
                 res = new llama_kv_cache_unified(
-                        hparams,
-                        {
-                            /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
-                                // choose long/short freq factors based on the context size
-                                if (layers[il].rope_freqs != nullptr) {
-                                    return layers[il].rope_freqs;
-                                }
-
-                                if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
-                                    return layers[il].rope_long;
-                                }
-
-                                return layers[il].rope_short;
-                            },
-                            /*.get_buft =*/ get_buft,
-                        },
+                        *this,
                         params.type_k,
                         params.type_v,
                         !cparams.flash_attn,
+                        cparams.offload_kqv,
                         cparams.n_ctx,
                         padding);
             }
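
With the callback struct removed, both KV caches are now constructed from the model reference plus cparams.offload_kqv, and the buffer-type choice that the deleted get_buft lambda made (layer device buffer type when offloading, CPU buffer type otherwise) presumably moves inside the cache implementations, which are not shown in this diff. A hedged sketch of that choice against the public ggml-backend API follows; select_kv_buft is an illustrative helper name, not a llama.cpp function.

#include "ggml-backend.h"

// Sketch only: mirrors the offload/CPU decision of the removed get_buft lambda.
// layer_dev would be the backend device assigned to layer il (dev_layer(il) in the old code).
static ggml_backend_buffer_type_t select_kv_buft(ggml_backend_dev_t layer_dev, bool offload) {
    if (offload && layer_dev != nullptr) {
        // offloaded: allocate this layer's K/V buffers in the layer device's buffer type
        return ggml_backend_dev_buffer_type(layer_dev);
    }
    // not offloaded: keep the K/V buffers in host (CPU) memory
    return ggml_backend_cpu_buffer_type();
}
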