@@ -57,6 +57,10 @@ uint32_t llama_context::n_ctx() const {
5757 return cparams.n_ctx ;
5858}
5959
60+ uint32_t llama_context::n_ctx_per_seq () const {
61+ return cparams.n_ctx / cparams.n_seq_max ;
62+ }
63+
6064uint32_t llama_context::n_batch () const {
6165 return cparams.n_batch ;
6266}
@@ -122,8 +126,8 @@ void llama_context::synchronize() {
122126}
123127
124128void llama_context::attach_threadpool (
125- ggml_threadpool_t threadpool,
126- ggml_threadpool_t threadpool_batch) {
129+ ggml_threadpool_t threadpool,
130+ ggml_threadpool_t threadpool_batch) {
127131 this ->threadpool = threadpool;
128132 this ->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
129133}
@@ -202,6 +206,86 @@ llama_perf_context_data llama_context::perf_get_data() const {
202206 return data;
203207}
204208
209+ ggml_tensor * llama_context::build_cvec (
210+ ggml_context * ctx0,
211+ ggml_tensor * cur,
212+ int il) {
213+ return cvec.apply_to (ctx0, cur, il);
214+ }
215+
216+ ggml_tensor * llama_context::build_lora_mm (
217+ ggml_context * ctx0,
218+ ggml_tensor * w,
219+ ggml_tensor * cur) {
220+ struct ggml_tensor * res = ggml_mul_mat (ctx0, w, cur);
221+
222+ for (const auto & lora : loras) {
223+ struct llama_adapter_lora_weight * lw = lora.first ->get_weight (w);
224+ if (lw == nullptr ) {
225+ continue ;
226+ }
227+
228+ const float adapter_scale = lora.second ;
229+ const float scale = lw->get_scale (lora.first ->alpha , adapter_scale);
230+
231+ struct ggml_tensor * ab_cur = ggml_mul_mat (
232+ ctx0, lw->b ,
233+ ggml_mul_mat (ctx0, lw->a , cur)
234+ );
235+
236+ ab_cur = ggml_scale (ctx0, ab_cur, scale);
237+ res = ggml_add (ctx0, res, ab_cur);
238+ }
239+
240+ return res;
241+ }
242+
243+ ggml_tensor * llama_context::build_lora_mm_id (
244+ ggml_context * ctx0,
245+ ggml_tensor * w,
246+ ggml_tensor * cur,
247+ ggml_tensor * ids) {
248+ struct ggml_tensor * res = ggml_mul_mat_id (ctx0, w, cur, ids);
249+ for (const auto & lora : loras) {
250+ struct llama_adapter_lora_weight * lw = lora.first ->get_weight (w);
251+ if (lw == nullptr ) {
252+ continue ;
253+ }
254+
255+ const float alpha = lora.first ->alpha ;
256+ const float rank = (float ) lw->b ->ne [0 ];
257+ const float scale = alpha ? lora.second * alpha / rank : lora.second ;
258+
259+ struct ggml_tensor * ab_cur = ggml_mul_mat_id (
260+ ctx0, lw->b ,
261+ ggml_mul_mat_id (ctx0, lw->a , cur, ids),
262+ ids
263+ );
264+
265+ ab_cur = ggml_scale (ctx0, ab_cur, scale);
266+ res = ggml_add (ctx0, res, ab_cur);
267+ }
268+
269+ return res;
270+ }
271+
272+ ggml_tensor * llama_context::build_rope_factors (int il) {
273+ const auto & hparams = model.hparams ;
274+
275+ // choose long/short freq factors based on the context size
276+ const auto n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max ;
277+
278+ if (model.layers [il].rope_freqs != nullptr ) {
279+ return model.layers [il].rope_freqs ;
280+ }
281+
282+ if (n_ctx_per_seq > hparams.n_ctx_orig_yarn ) {
283+ return model.layers [il].rope_long ;
284+ }
285+
286+ return model.layers [il].rope_short ;
287+ }
288+
205289void llama_context::perf_reset () {
206290 t_start_us = ggml_time_us ();
207291 t_eval_us = n_eval = 0 ;
@@ -217,7 +301,7 @@ llama_context_unified::llama_context_unified(
217301 const llama_context_params & params,
218302 build_graph_callback && cb_build_graph) :
219303 llama_context(model),
220- cb_build_graph(std::move(cb_build_graph)){
304+ cb_build_graph(std::move(cb_build_graph)) {
221305
222306 const auto & hparams = model.hparams ;
223307
@@ -1825,69 +1909,6 @@ size_t llama_context_unified::reserve_outputs(size_t n_outputs) {
18251909 return n_outputs_max;
18261910}
18271911
1828- ggml_tensor * llama_context::build_cvec (
1829- ggml_context * ctx0,
1830- ggml_tensor * cur,
1831- int il) {
1832- return cvec.apply_to (ctx0, cur, il);
1833- }
1834-
1835- ggml_tensor * llama_context::build_lora_mm (
1836- ggml_context * ctx0,
1837- ggml_tensor * w,
1838- ggml_tensor * cur) {
1839- struct ggml_tensor * res = ggml_mul_mat (ctx0, w, cur);
1840-
1841- for (const auto & lora : loras) {
1842- struct llama_adapter_lora_weight * lw = lora.first ->get_weight (w);
1843- if (lw == nullptr ) {
1844- continue ;
1845- }
1846-
1847- const float adapter_scale = lora.second ;
1848- const float scale = lw->get_scale (lora.first ->alpha , adapter_scale);
1849-
1850- struct ggml_tensor * ab_cur = ggml_mul_mat (
1851- ctx0, lw->b ,
1852- ggml_mul_mat (ctx0, lw->a , cur)
1853- );
1854-
1855- ab_cur = ggml_scale (ctx0, ab_cur, scale);
1856- res = ggml_add (ctx0, res, ab_cur);
1857- }
1858-
1859- return res;
1860- }
1861-
1862- ggml_tensor * llama_context::build_lora_mm_id (
1863- ggml_context * ctx0,
1864- ggml_tensor * w,
1865- ggml_tensor * cur,
1866- ggml_tensor * ids) {
1867- struct ggml_tensor * res = ggml_mul_mat_id (ctx0, w, cur, ids);
1868- for (const auto & lora : loras) {
1869- struct llama_adapter_lora_weight * lw = lora.first ->get_weight (w);
1870- if (lw == nullptr ) {
1871- continue ;
1872- }
1873-
1874- const float alpha = lora.first ->alpha ;
1875- const float rank = (float ) lw->b ->ne [0 ];
1876- const float scale = alpha ? lora.second * alpha / rank : lora.second ;
1877-
1878- struct ggml_tensor * ab_cur = ggml_mul_mat_id (
1879- ctx0, lw->b ,
1880- ggml_mul_mat_id (ctx0, lw->a , cur, ids),
1881- ids
1882- );
1883-
1884- ab_cur = ggml_scale (ctx0, ab_cur, scale);
1885- res = ggml_add (ctx0, res, ab_cur);
1886- }
1887-
1888- return res;
1889- }
1890-
18911912void llama_context_unified::kv_self_update () {
18921913 auto & kv = kv_self;
18931914
@@ -2189,23 +2210,6 @@ ggml_tensor * llama_context_unified::build_soft_max_ext(
21892210 return ggml_soft_max_ext (ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias );
21902211}
21912212
2192- ggml_tensor * llama_context_unified::get_rope_factors (int il) {
2193- const auto & hparams = model.hparams ;
2194-
2195- // choose long/short freq factors based on the context size
2196- const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max ;
2197-
2198- if (model.layers [il].rope_freqs != nullptr ) {
2199- return model.layers [il].rope_freqs ;
2200- }
2201-
2202- if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn ) {
2203- return model.layers [il].rope_long ;
2204- }
2205-
2206- return model.layers [il].rope_short ;
2207- }
2208-
22092213ggml_tensor * llama_context_unified::build_inp_embd (
22102214 ggml_context * ctx0,
22112215 ggml_tensor * tok_embd,
@@ -2327,7 +2331,7 @@ void llama_context_unified::build_k_shift(
23272331 const int64_t n_head_kv = hparams.n_head_kv (il);
23282332 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa (il);
23292333
2330- struct ggml_tensor * rope_factors = get_rope_factors (il);
2334+ struct ggml_tensor * rope_factors = build_rope_factors (il);
23312335
23322336 struct ggml_tensor * k =
23332337 ggml_view_3d (ctx0, kv_self.k_l [il],
0 commit comments