@@ -57,6 +57,10 @@ uint32_t llama_context::n_ctx() const {
57
57
return cparams.n_ctx ;
58
58
}
59
59
60
+ uint32_t llama_context::n_ctx_per_seq () const {
61
+ return cparams.n_ctx / cparams.n_seq_max ;
62
+ }
63
+
60
64
// Configured batch size (cparams.n_batch).
uint32_t llama_context::n_batch() const {
    return cparams.n_batch;
}
@@ -122,8 +126,8 @@ void llama_context::synchronize() {
122
126
}
123
127
124
128
// Attach caller-provided threadpools to this context.
// When no dedicated batch pool is supplied, the generic pool is reused
// for batch processing as well.
void llama_context::attach_threadpool(
        ggml_threadpool_t threadpool,
        ggml_threadpool_t threadpool_batch) {
    ggml_threadpool_t batch_pool = threadpool_batch;
    if (batch_pool == nullptr) {
        // fall back to the generic pool
        batch_pool = threadpool;
    }

    this->threadpool       = threadpool;
    this->threadpool_batch = batch_pool;
}
@@ -202,6 +206,86 @@ llama_perf_context_data llama_context::perf_get_data() const {
202
206
return data;
203
207
}
204
208
209
+ ggml_tensor * llama_context::build_cvec (
210
+ ggml_context * ctx0,
211
+ ggml_tensor * cur,
212
+ int il) {
213
+ return cvec.apply_to (ctx0, cur, il);
214
+ }
215
+
216
+ ggml_tensor * llama_context::build_lora_mm (
217
+ ggml_context * ctx0,
218
+ ggml_tensor * w,
219
+ ggml_tensor * cur) {
220
+ struct ggml_tensor * res = ggml_mul_mat (ctx0, w, cur);
221
+
222
+ for (const auto & lora : loras) {
223
+ struct llama_adapter_lora_weight * lw = lora.first ->get_weight (w);
224
+ if (lw == nullptr ) {
225
+ continue ;
226
+ }
227
+
228
+ const float adapter_scale = lora.second ;
229
+ const float scale = lw->get_scale (lora.first ->alpha , adapter_scale);
230
+
231
+ struct ggml_tensor * ab_cur = ggml_mul_mat (
232
+ ctx0, lw->b ,
233
+ ggml_mul_mat (ctx0, lw->a , cur)
234
+ );
235
+
236
+ ab_cur = ggml_scale (ctx0, ab_cur, scale);
237
+ res = ggml_add (ctx0, res, ab_cur);
238
+ }
239
+
240
+ return res;
241
+ }
242
+
243
+ ggml_tensor * llama_context::build_lora_mm_id (
244
+ ggml_context * ctx0,
245
+ ggml_tensor * w,
246
+ ggml_tensor * cur,
247
+ ggml_tensor * ids) {
248
+ struct ggml_tensor * res = ggml_mul_mat_id (ctx0, w, cur, ids);
249
+ for (const auto & lora : loras) {
250
+ struct llama_adapter_lora_weight * lw = lora.first ->get_weight (w);
251
+ if (lw == nullptr ) {
252
+ continue ;
253
+ }
254
+
255
+ const float alpha = lora.first ->alpha ;
256
+ const float rank = (float ) lw->b ->ne [0 ];
257
+ const float scale = alpha ? lora.second * alpha / rank : lora.second ;
258
+
259
+ struct ggml_tensor * ab_cur = ggml_mul_mat_id (
260
+ ctx0, lw->b ,
261
+ ggml_mul_mat_id (ctx0, lw->a , cur, ids),
262
+ ids
263
+ );
264
+
265
+ ab_cur = ggml_scale (ctx0, ab_cur, scale);
266
+ res = ggml_add (ctx0, res, ab_cur);
267
+ }
268
+
269
+ return res;
270
+ }
271
+
272
+ ggml_tensor * llama_context::build_rope_factors (int il) {
273
+ const auto & hparams = model.hparams ;
274
+
275
+ // choose long/short freq factors based on the context size
276
+ const auto n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max ;
277
+
278
+ if (model.layers [il].rope_freqs != nullptr ) {
279
+ return model.layers [il].rope_freqs ;
280
+ }
281
+
282
+ if (n_ctx_per_seq > hparams.n_ctx_orig_yarn ) {
283
+ return model.layers [il].rope_long ;
284
+ }
285
+
286
+ return model.layers [il].rope_short ;
287
+ }
288
+
205
289
void llama_context::perf_reset () {
206
290
t_start_us = ggml_time_us ();
207
291
t_eval_us = n_eval = 0 ;
@@ -217,7 +301,7 @@ llama_context_unified::llama_context_unified(
217
301
const llama_context_params & params,
218
302
build_graph_callback && cb_build_graph) :
219
303
llama_context(model),
220
- cb_build_graph(std::move(cb_build_graph)){
304
+ cb_build_graph(std::move(cb_build_graph)) {
221
305
222
306
const auto & hparams = model.hparams ;
223
307
@@ -1825,69 +1909,6 @@ size_t llama_context_unified::reserve_outputs(size_t n_outputs) {
1825
1909
return n_outputs_max;
1826
1910
}
1827
1911
1828
- ggml_tensor * llama_context::build_cvec (
1829
- ggml_context * ctx0,
1830
- ggml_tensor * cur,
1831
- int il) {
1832
- return cvec.apply_to (ctx0, cur, il);
1833
- }
1834
-
1835
- ggml_tensor * llama_context::build_lora_mm (
1836
- ggml_context * ctx0,
1837
- ggml_tensor * w,
1838
- ggml_tensor * cur) {
1839
- struct ggml_tensor * res = ggml_mul_mat (ctx0, w, cur);
1840
-
1841
- for (const auto & lora : loras) {
1842
- struct llama_adapter_lora_weight * lw = lora.first ->get_weight (w);
1843
- if (lw == nullptr ) {
1844
- continue ;
1845
- }
1846
-
1847
- const float adapter_scale = lora.second ;
1848
- const float scale = lw->get_scale (lora.first ->alpha , adapter_scale);
1849
-
1850
- struct ggml_tensor * ab_cur = ggml_mul_mat (
1851
- ctx0, lw->b ,
1852
- ggml_mul_mat (ctx0, lw->a , cur)
1853
- );
1854
-
1855
- ab_cur = ggml_scale (ctx0, ab_cur, scale);
1856
- res = ggml_add (ctx0, res, ab_cur);
1857
- }
1858
-
1859
- return res;
1860
- }
1861
-
1862
- ggml_tensor * llama_context::build_lora_mm_id (
1863
- ggml_context * ctx0,
1864
- ggml_tensor * w,
1865
- ggml_tensor * cur,
1866
- ggml_tensor * ids) {
1867
- struct ggml_tensor * res = ggml_mul_mat_id (ctx0, w, cur, ids);
1868
- for (const auto & lora : loras) {
1869
- struct llama_adapter_lora_weight * lw = lora.first ->get_weight (w);
1870
- if (lw == nullptr ) {
1871
- continue ;
1872
- }
1873
-
1874
- const float alpha = lora.first ->alpha ;
1875
- const float rank = (float ) lw->b ->ne [0 ];
1876
- const float scale = alpha ? lora.second * alpha / rank : lora.second ;
1877
-
1878
- struct ggml_tensor * ab_cur = ggml_mul_mat_id (
1879
- ctx0, lw->b ,
1880
- ggml_mul_mat_id (ctx0, lw->a , cur, ids),
1881
- ids
1882
- );
1883
-
1884
- ab_cur = ggml_scale (ctx0, ab_cur, scale);
1885
- res = ggml_add (ctx0, res, ab_cur);
1886
- }
1887
-
1888
- return res;
1889
- }
1890
-
1891
1912
void llama_context_unified::kv_self_update () {
1892
1913
auto & kv = kv_self;
1893
1914
@@ -2189,23 +2210,6 @@ ggml_tensor * llama_context_unified::build_soft_max_ext(
2189
2210
return ggml_soft_max_ext (ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias );
2190
2211
}
2191
2212
2192
- ggml_tensor * llama_context_unified::get_rope_factors (int il) {
2193
- const auto & hparams = model.hparams ;
2194
-
2195
- // choose long/short freq factors based on the context size
2196
- const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max ;
2197
-
2198
- if (model.layers [il].rope_freqs != nullptr ) {
2199
- return model.layers [il].rope_freqs ;
2200
- }
2201
-
2202
- if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn ) {
2203
- return model.layers [il].rope_long ;
2204
- }
2205
-
2206
- return model.layers [il].rope_short ;
2207
- }
2208
-
2209
2213
ggml_tensor * llama_context_unified::build_inp_embd (
2210
2214
ggml_context * ctx0,
2211
2215
ggml_tensor * tok_embd,
@@ -2327,7 +2331,7 @@ void llama_context_unified::build_k_shift(
2327
2331
const int64_t n_head_kv = hparams.n_head_kv (il);
2328
2332
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa (il);
2329
2333
2330
- struct ggml_tensor * rope_factors = get_rope_factors (il);
2334
+ struct ggml_tensor * rope_factors = build_rope_factors (il);
2331
2335
2332
2336
struct ggml_tensor * k =
2333
2337
ggml_view_3d (ctx0, kv_self.k_l [il],
0 commit comments