@@ -598,12 +598,8 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
         float freq_base,
-        float freq_scale,
-        ggml_backend_buffer * bbuf) const {
+        float freq_scale) const {
     const auto & cparams = lctx.get_cparams();
-    const auto & backends = lctx.get_backends();
-
-    auto * sched = lctx.get_sched();
 
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
 
@@ -624,17 +620,6 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
 
-        // TODO: can we simplify/avoid this? [TAG_BACKENDS]
-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched, tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
         tmp = ggml_rope_ext_inplace(ctx, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
@@ -719,7 +704,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
                 ggml_row_size(k_l[il]->type, n_embd_k_gqa),
                 0);
 
-        ggml_tensor * cur = build_rope_shift(lctx, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(lctx, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
 
         ggml_build_forward_expand(gf, cur);
     }