@@ -180,6 +180,8 @@ llama_context::llama_context(
 llama_context::~llama_context() = default;
 
 void llama_context::init() {
+    LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
     const auto & hparams = model.hparams;
 
     if (hparams.vocab_only) {
@@ -188,13 +190,15 @@ void llama_context::init() {
     }
 
     {
-        // buffer types used for the compute buffer of each backend
+        LLAMA_LOG_DEBUG("%s: enumerating backends\n", __func__);
+
         backend_buft.clear();
         backend_ptrs.clear();
 
         for (auto & backend : backends) {
             auto * buft = ggml_backend_get_default_buffer_type(backend.get());
             auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+
             if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) {
                 // use the host buffer of the first device CPU for faster transfer of the intermediate state
                 auto * dev = model.devices[0];
@@ -203,14 +207,18 @@ void llama_context::init() {
                     buft = host_buft;
                 }
             }
+
             backend_buft.push_back(buft);
             backend_ptrs.push_back(backend.get());
         }
 
+        LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
+
         const size_t max_nodes = this->max_nodes();
 
+        LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
+
         // buffer used to store the computation graph and the tensor meta data
-        // TODO: move to base class
         buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
 
         // TODO: move these checks to ggml_backend_sched
@@ -247,6 +255,8 @@ void llama_context::init() {
         }
     }
 
+    LLAMA_LOG_DEBUG("%s: calling reserve()\n", __func__);
+
     reserve();
 }
 
@@ -286,15 +296,17 @@ void llama_context::reserve() {
 
     llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
 
+    // max number of outputs
+    n_outputs = n_tokens;
+
+    LLAMA_LOG_DEBUG("%s: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
+
     int n_splits_pp = -1;
     int n_nodes_pp = -1;
 
     int n_splits_tg = -1;
     int n_nodes_tg = -1;
 
-    // max number of outputs
-    n_outputs = n_tokens;
-
     // reserve pp graph first so that buffers are only allocated once
     {
         llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr };
@@ -521,21 +533,29 @@ int64_t llama_context::n_pos_per_token() const {
 void llama_context::attach_threadpool(
         ggml_threadpool_t threadpool,
         ggml_threadpool_t threadpool_batch) {
+    LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
     this->threadpool = threadpool;
     this->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
 }
 
 void llama_context::detach_threadpool() {
+    LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
     this->threadpool = nullptr;
     this->threadpool_batch = nullptr;
 }
 
 void llama_context::set_n_threads(int32_t n_threads, int32_t n_threads_batch) {
+    LLAMA_LOG_DEBUG("%s: n_threads = %d, n_threads_batch = %d\n", __func__, n_threads, n_threads_batch);
+
     cparams.n_threads = n_threads;
     cparams.n_threads_batch = n_threads_batch;
 }
 
 void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) {
+    LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
     this->abort_callback = abort_callback;
     this->abort_callback_data = abort_callback_data;
 
@@ -549,21 +569,29 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void
549569}
550570
551571void llama_context::set_embeddings (bool value) {
572+ LLAMA_LOG_DEBUG (" %s: value = %d\n " , __func__, value);
573+
552574 cparams.embeddings = value;
553575}
554576
555577void llama_context::set_causal_attn (bool value) {
578+ LLAMA_LOG_DEBUG (" %s: value = %d\n " , __func__, value);
579+
556580 cparams.causal_attn = value;
557581}
558582
559583void llama_context::set_adapter_lora (
560- struct llama_adapter_lora * adapter,
584+ llama_adapter_lora * adapter,
561585 float scale) {
586+ LLAMA_LOG_DEBUG (" %s: adapter = %p, scale = %f\n " , __func__, (void *) adapter, scale);
587+
562588 loras[adapter] = scale;
563589}
564590
565591bool llama_context::rm_adapter_lora (
566- struct llama_adapter_lora * adapter) {
592+ llama_adapter_lora * adapter) {
593+ LLAMA_LOG_DEBUG (" %s: adapter = %p\n " , __func__, (void *) adapter);
594+
567595 auto pos = loras.find (adapter);
568596 if (pos != loras.end ()) {
569597 loras.erase (pos);
@@ -574,6 +602,8 @@ bool llama_context::rm_adapter_lora(
 }
 
 void llama_context::clear_adapter_lora() {
+    LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
     loras.clear();
 }
 
@@ -583,6 +613,8 @@ bool llama_context::apply_adapter_cvec(
         int32_t n_embd,
         int32_t il_start,
         int32_t il_end) {
+    LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
+
     return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }
 
@@ -2085,15 +2117,21 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file
 }
 
 size_t llama_context::state_get_data(llama_io_write_i & io) {
+    LLAMA_LOG_DEBUG("%s: writing state\n", __func__);
+
     // write model info
     {
+        LLAMA_LOG_DEBUG("%s: - writing model info\n", __func__);
+
         const std::string arch_str = llm_arch_name(model.arch);
         io.write_string(arch_str);
         // TODO: add more model-specific info which should prevent loading the session file if not identical
     }
 
     // write output ids
     {
+        LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__);
+
         output_reorder();
 
         const auto n_outputs = this->n_outputs;
@@ -2124,6 +2162,8 @@ size_t llama_context::state_get_data(llama_io_write_i & io) {
 
     // write logits
     {
+        LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
+
         const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());
 
         io.write(&logits_size, sizeof(logits_size));
@@ -2135,6 +2175,8 @@ size_t llama_context::state_get_data(llama_io_write_i & io) {
 
     // write embeddings
     {
+        LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__);
+
         const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd);
 
         io.write(&embd_size, sizeof(embd_size));
@@ -2148,8 +2190,12 @@ size_t llama_context::state_get_data(llama_io_write_i & io) {
21482190}
21492191
21502192size_t llama_context::state_set_data (llama_io_read_i & io) {
2193+ LLAMA_LOG_DEBUG (" %s: reading state\n " , __func__);
2194+
21512195 // read model info
21522196 {
2197+ LLAMA_LOG_DEBUG (" %s: - reading model info\n " , __func__);
2198+
21532199 const std::string cur_arch_str = llm_arch_name (model.arch );
21542200
21552201 std::string arch_str;
@@ -2162,6 +2208,8 @@ size_t llama_context::state_set_data(llama_io_read_i & io) {
 
     // read output ids
     {
+        LLAMA_LOG_DEBUG("%s: - reading output ids\n", __func__);
+
         auto n_outputs = this->n_outputs;
         io.read_to(&n_outputs, sizeof(n_outputs));
 
@@ -2189,6 +2237,8 @@ size_t llama_context::state_set_data(llama_io_read_i & io) {
 
     // read logits
     {
+        LLAMA_LOG_DEBUG("%s: - reading logits\n", __func__);
+
         uint64_t logits_size;
         io.read_to(&logits_size, sizeof(logits_size));
 
@@ -2203,6 +2253,8 @@ size_t llama_context::state_set_data(llama_io_read_i & io) {
 
     // read embeddings
     {
+        LLAMA_LOG_DEBUG("%s: - reading embeddings\n", __func__);
+
         uint64_t embd_size;
         io.read_to(&embd_size, sizeof(embd_size));
 
@@ -2285,6 +2337,8 @@ void llama_context_kv_self::reserve() {
     // simulate full KV cache
     kv_self.n = kv_self.size;
 
+    LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n);
+
     llama_context::reserve();
 }
 
@@ -2297,6 +2351,8 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const {
 }
 
 void llama_context_kv_self::kv_self_update() {
+    LLAMA_LOG_DEBUG("%s: kv_self_update()\n", __func__);
+
     auto & kv = kv_self;
 
     bool need_reserve = false;
@@ -2306,6 +2362,8 @@ void llama_context_kv_self::kv_self_update() {
             GGML_ABORT("The current context does not support K-shift");
         }
 
+        LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__);
+
         // apply K-shift if needed
         if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
             ggml_backend_sched_reset(sched.get());
@@ -2334,6 +2392,8 @@ void llama_context_kv_self::kv_self_update() {
 
     // defragment the KV cache if needed
     if (kv.do_defrag) {
+        LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+
         ggml_backend_sched_reset(sched.get());
 
         auto * gf = graph_init();
@@ -3667,6 +3727,7 @@ ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross(
 size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) {
     llama_context::state_get_data(io);
 
+    LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
     kv_self.state_write(io);
 
     return io.n_bytes();
@@ -3675,6 +3736,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) {
 size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) {
     llama_context::state_set_data(io);
 
+    LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__);
     kv_self.state_read(io);
 
     return io.n_bytes();
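
For reference, a minimal sketch of how an application could capture the LLAMA_LOG_DEBUG messages added in this commit, assuming the public llama_log_set API from llama.h and the ggml_log_level enum from ggml.h; the callback name, its body, and the level-based tagging below are illustrative, not part of the commit:

#include <cstdio>

#include "llama.h"

// forward all llama.cpp log output to stderr, tagging debug-level lines;
// install it before creating any llama_model / llama_context so that the
// debug messages added in this commit are captured
static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;

    if (level == GGML_LOG_LEVEL_DEBUG) {
        fprintf(stderr, "[debug] %s", text);
    } else {
        fprintf(stderr, "%s", text);
    }
}

int main() {
    llama_log_set(my_log_callback, nullptr);

    // ... load a model and create a llama_context as usual ...

    return 0;
}

An application that wants quieter output could instead drop debug-level lines in the callback and keep only warnings and errors.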