@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
               float   freq_base,
-              float   freq_scale,
-        ggml_backend_buffer * bbuf) const {
+              float   freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
 
     const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
 
-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
-        tmp = ggml_rope_ext_inplace(ctx0, tmp,
+        tmp = ggml_rope_ext(ctx0, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
                 ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
                 0);
 
-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
 
         ggml_build_forward_expand(gf, cur);
     }
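
For quick reference, this is how the quantized-K branch of `build_rope_shift` reads once the hunks above are applied. It is a sketch assembled from the diff, not a verbatim excerpt: the trailing `ggml_cpy` back into the quantized tensor is assumed from the unchanged surrounding code, and the comments about scheduler placement are an interpretation of why the `bbuf` plumbing is no longer needed.

```cpp
if (ggml_is_quantized(cur->type)) {
    // dequantize to f32 -> RoPE -> quantize back
    tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);

    // Non-inplace RoPE: the result is a fresh tensor, so ggml_backend_sched can
    // choose where it lives on its own. There is no longer any need to look up
    // which backend owns the KV cache buffer and pin the temporary to it with
    // ggml_backend_sched_set_tensor_backend(), which is what the removed
    // bbuf/backends loop used to do.
    tmp = ggml_rope_ext(ctx0, tmp,
            shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
            yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);

    // Quantize back into the K cache tensor (assumed from the unchanged code
    // that follows this hunk).
    tmp = ggml_cpy(ctx0, tmp, cur);
}
```

Callers now pass only the tensors and RoPE parameters, as shown in the last hunk: `build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l)`.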