
Commit ebf1bdf

context : add logs
ggml-ci
1 parent 548c230 commit ebf1bdf


3 files changed: +76 −12 lines changed


examples/save-load-state/save-load-state.cpp

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    print_build_info();
+    common_init();
 
     if (params.n_predict < 0) {
         params.n_predict = 16;
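
Note: common_init() replaces the older print_build_info() call at the start of the example. A minimal sketch of the resulting startup sequence, assuming common_init() from common/common.h also covers the build-info logging the old call did (argument parsing elided; comments are illustrative, not part of this commit):

    #include "common.h"

    int main(int argc, char ** argv) {
        // ... parse common_params as the examples normally do ...

        common_init(); // one-time init: assumed to log build info and set up common logging

        // ... load the model and run the save/load-state round trip ...
        return 0;
    }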

src/llama-context.cpp

Lines changed: 69 additions & 7 deletions
@@ -180,6 +180,8 @@ llama_context::llama_context(
 llama_context::~llama_context() = default;
 
 void llama_context::init() {
+    LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
     const auto & hparams = model.hparams;
 
     if (hparams.vocab_only) {
@@ -188,13 +190,15 @@ void llama_context::init() {
     }
 
     {
-        // buffer types used for the compute buffer of each backend
+        LLAMA_LOG_DEBUG("%s: enumerating backends\n", __func__);
+
         backend_buft.clear();
         backend_ptrs.clear();
 
         for (auto & backend : backends) {
            auto * buft = ggml_backend_get_default_buffer_type(backend.get());
            auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+
            if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) {
                // use the host buffer of the first device CPU for faster transfer of the intermediate state
                auto * dev = model.devices[0];
@@ -203,14 +207,18 @@ void llama_context::init() {
                    buft = host_buft;
                }
            }
+
            backend_buft.push_back(buft);
            backend_ptrs.push_back(backend.get());
        }
 
+        LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
+
        const size_t max_nodes = this->max_nodes();
 
+        LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
+
        // buffer used to store the computation graph and the tensor meta data
-        // TODO: move to base class
        buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
 
        // TODO: move these checks to ggml_backend_sched
@@ -247,6 +255,8 @@ void llama_context::init() {
        }
    }
 
+    LLAMA_LOG_DEBUG("%s: calling reserve()\n", __func__);
+
     reserve();
 }
 
@@ -286,15 +296,17 @@ void llama_context::reserve() {
 
     llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
 
+    // max number of outputs
+    n_outputs = n_tokens;
+
+    LLAMA_LOG_DEBUG("%s: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
+
     int n_splits_pp = -1;
     int n_nodes_pp  = -1;
 
     int n_splits_tg = -1;
     int n_nodes_tg  = -1;
 
-    // max number of outputs
-    n_outputs = n_tokens;
-
     // reserve pp graph first so that buffers are only allocated once
     {
         llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
@@ -521,21 +533,29 @@ int64_t llama_context::n_pos_per_token() const {
 void llama_context::attach_threadpool(
         ggml_threadpool_t threadpool,
         ggml_threadpool_t threadpool_batch) {
+    LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
     this->threadpool       = threadpool;
     this->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
 }
 
 void llama_context::detach_threadpool() {
+    LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
     this->threadpool       = nullptr;
     this->threadpool_batch = nullptr;
 }
 
 void llama_context::set_n_threads(int32_t n_threads, int32_t n_threads_batch) {
+    LLAMA_LOG_DEBUG("%s: n_threads = %d, n_threads_batch = %d\n", __func__, n_threads, n_threads_batch);
+
     cparams.n_threads       = n_threads;
     cparams.n_threads_batch = n_threads_batch;
 }
 
 void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) {
+    LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
     this->abort_callback      = abort_callback;
     this->abort_callback_data = abort_callback_data;
 
@@ -549,21 +569,29 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void
 }
 
 void llama_context::set_embeddings(bool value) {
+    LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+
     cparams.embeddings = value;
 }
 
 void llama_context::set_causal_attn(bool value) {
+    LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+
     cparams.causal_attn = value;
 }
 
 void llama_context::set_adapter_lora(
-        struct llama_adapter_lora * adapter,
+        llama_adapter_lora * adapter,
         float scale) {
+    LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale);
+
     loras[adapter] = scale;
 }
 
 bool llama_context::rm_adapter_lora(
-        struct llama_adapter_lora * adapter) {
+        llama_adapter_lora * adapter) {
+    LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter);
+
     auto pos = loras.find(adapter);
     if (pos != loras.end()) {
         loras.erase(pos);
@@ -574,6 +602,8 @@ bool llama_context::rm_adapter_lora(
 }
 
 void llama_context::clear_adapter_lora() {
+    LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
     loras.clear();
 }
 
@@ -583,6 +613,8 @@ bool llama_context::apply_adapter_cvec(
         int32_t n_embd,
         int32_t il_start,
         int32_t il_end) {
+    LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
+
     return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }
 
@@ -2085,15 +2117,21 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file
 }
 
 size_t llama_context::state_get_data(llama_io_write_i & io) {
+    LLAMA_LOG_DEBUG("%s: writing state\n", __func__);
+
     // write model info
     {
+        LLAMA_LOG_DEBUG("%s: - writing model info\n", __func__);
+
         const std::string arch_str = llm_arch_name(model.arch);
         io.write_string(arch_str);
         // TODO: add more model-specific info which should prevent loading the session file if not identical
     }
 
     // write output ids
     {
+        LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__);
+
         output_reorder();
 
         const auto n_outputs = this->n_outputs;
@@ -2124,6 +2162,8 @@ size_t llama_context::state_get_data(llama_io_write_i & io) {
 
     // write logits
     {
+        LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
+
         const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());
 
         io.write(&logits_size, sizeof(logits_size));
@@ -2135,6 +2175,8 @@ size_t llama_context::state_get_data(llama_io_write_i & io) {
 
     // write embeddings
     {
+        LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__);
+
         const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd);
 
         io.write(&embd_size, sizeof(embd_size));
@@ -2148,8 +2190,12 @@ size_t llama_context::state_get_data(llama_io_write_i & io) {
 }
 
 size_t llama_context::state_set_data(llama_io_read_i & io) {
+    LLAMA_LOG_DEBUG("%s: reading state\n", __func__);
+
     // read model info
     {
+        LLAMA_LOG_DEBUG("%s: - reading model info\n", __func__);
+
         const std::string cur_arch_str = llm_arch_name(model.arch);
 
         std::string arch_str;
@@ -2162,6 +2208,8 @@ size_t llama_context::state_set_data(llama_io_read_i & io) {
 
     // read output ids
     {
+        LLAMA_LOG_DEBUG("%s: - reading output ids\n", __func__);
+
         auto n_outputs = this->n_outputs;
         io.read_to(&n_outputs, sizeof(n_outputs));
 
@@ -2189,6 +2237,8 @@ size_t llama_context::state_set_data(llama_io_read_i & io) {
 
     // read logits
     {
+        LLAMA_LOG_DEBUG("%s: - reading logits\n", __func__);
+
         uint64_t logits_size;
         io.read_to(&logits_size, sizeof(logits_size));
 
@@ -2203,6 +2253,8 @@ size_t llama_context::state_set_data(llama_io_read_i & io) {
 
     // read embeddings
     {
+        LLAMA_LOG_DEBUG("%s: - reading embeddings\n", __func__);
+
         uint64_t embd_size;
         io.read_to(&embd_size, sizeof(embd_size));
 
@@ -2285,6 +2337,8 @@ void llama_context_kv_self::reserve() {
     // simulate full KV cache
     kv_self.n = kv_self.size;
 
+    LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n);
+
     llama_context::reserve();
 }
 
@@ -2297,6 +2351,8 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const {
 }
 
 void llama_context_kv_self::kv_self_update() {
+    LLAMA_LOG_DEBUG("%s: kv_self_update()\n", __func__);
+
     auto & kv = kv_self;
 
     bool need_reserve = false;
@@ -2306,6 +2362,8 @@ void llama_context_kv_self::kv_self_update() {
            GGML_ABORT("The current context does not support K-shift");
        }
 
+        LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__);
+
        // apply K-shift if needed
        if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
            ggml_backend_sched_reset(sched.get());
@@ -2334,6 +2392,8 @@ void llama_context_kv_self::kv_self_update() {
 
     // defragment the KV cache if needed
     if (kv.do_defrag) {
+        LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+
         ggml_backend_sched_reset(sched.get());
 
         auto * gf = graph_init();
@@ -3667,6 +3727,7 @@ ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross(
 size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) {
     llama_context::state_get_data(io);
 
+    LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
     kv_self.state_write(io);
 
     return io.n_bytes();
@@ -3675,6 +3736,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) {
 size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) {
     llama_context::state_set_data(io);
 
+    LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__);
     kv_self.state_read(io);
 
     return io.n_bytes();
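
All of the new traces go through LLAMA_LOG_DEBUG, i.e. the library's regular log callback at debug level. A minimal sketch of surfacing them from an application via the public llama_log_set() API; the callback name and the choice to forward every level unfiltered are assumptions for illustration, not part of this commit:

    #include <cstdio>

    #include "llama.h"

    // forward every library log message, including debug-level traces, to stderr
    static void log_all(enum ggml_log_level level, const char * text, void * /* user_data */) {
        (void) level; // no filtering, so the new llama_context debug traces are printed too
        fputs(text, stderr);
        fflush(stderr);
    }

    int main() {
        llama_log_set(log_all, nullptr);

        // ... create the model and context as usual; the traces added in
        //     llama_context::init(), state_get_data()/state_set_data() and
        //     llama_context_kv_self::kv_self_update() now reach stderr ...

        return 0;
    }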

src/llama-context.h

Lines changed: 6 additions & 4 deletions
@@ -55,11 +55,13 @@ struct llama_context : public llama_graph_i {
 
     virtual int32_t max_nodes() const;
 
-    // returns nullptr
+    // self-attention:
+
+    // if the context does not have a KV cache, return nullptr
     virtual       llama_kv_cache * get_kv_self();
     virtual const llama_kv_cache * get_kv_self() const;
 
-    // noop
+    // if the context does not have a KV cache, noop
     virtual void kv_self_update();
 
     virtual enum llama_pooling_type pooling_type() const;
@@ -87,11 +89,11 @@ struct llama_context : public llama_graph_i {
     virtual void set_causal_attn(bool value);
 
     virtual void set_adapter_lora(
-            struct llama_adapter_lora * adapter,
+            llama_adapter_lora * adapter,
             float scale);
 
     virtual bool rm_adapter_lora(
-            struct llama_adapter_lora * adapter);
+            llama_adapter_lora * adapter);
 
     virtual void clear_adapter_lora();
 
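
The reworded header comments make the base-class contract explicit: a context without a KV cache returns nullptr from get_kv_self() and treats kv_self_update() as a noop. A hypothetical sketch of internal code relying on that contract (the helper below is illustrative and does not exist in the tree):

    // hypothetical helper: safe on any llama_context, with or without a KV cache
    static void maybe_update_kv(llama_context & ctx) {
        if (ctx.get_kv_self() != nullptr) {
            ctx.kv_self_update(); // only meaningful when a KV cache is present
        }
        // for cache-less contexts, kv_self_update() is documented as a noop anyway
    }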

0 commit comments