From de30cb6a7a4d37a00c5507263f3dd8e6783aee29 Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Mon, 30 Jun 2025 13:35:55 +0200 Subject: [PATCH 01/10] Chore: batch prompts, extract tensors specific layer --- common/arg.cpp | 91 ++++++---- examples/eval-callback/eval-callback.cpp | 221 +++++++++++++---------- 2 files changed, 176 insertions(+), 136 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0d0daa3610105..74559d28a4474 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -891,51 +891,62 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context }; for (int i = 1; i < argc; i++) { - const std::string arg_prefix = "--"; + const std::string arg_prefix = "--"; - std::string arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - if (arg_to_options.find(arg) == arg_to_options.end()) { - throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); - } - auto opt = *arg_to_options[arg]; - if (opt.has_value_from_env()) { - fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); + std::string arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + // Skip --parse-layer and its value(s) + if (arg == "--parse-layer") { + // Assuming --parse-layer takes exactly 1 argument + if (i + 1 < argc) { + i++; // skip the next value as well } - try { - if (opt.handler_void) { - opt.handler_void(params); - continue; - } + continue; + } - // arg with single value - check_arg(i); - std::string val = argv[++i]; - if (opt.handler_int) { - opt.handler_int(params, std::stoi(val)); - continue; - } - if (opt.handler_string) { - opt.handler_string(params, val); - continue; - } + if (arg_to_options.find(arg) == arg_to_options.end()) { + throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); + } - // arg with 2 values - check_arg(i); - std::string val2 = argv[++i]; - if (opt.handler_str_str) { - opt.handler_str_str(params, val, val2); - continue; - } - } catch (std::exception & e) { - throw std::invalid_argument(string_format( - "error while handling argument \"%s\": %s\n\n" - "usage:\n%s\n\nto show complete usage, run with -h", - arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); - } + auto opt = *arg_to_options[arg]; + if (opt.has_value_from_env()) { + fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); + } + try { + if (opt.handler_void) { + opt.handler_void(params); + continue; + } + + // arg with single value + check_arg(i); + std::string val = argv[++i]; + if (opt.handler_int) { + opt.handler_int(params, std::stoi(val)); + continue; + } + if (opt.handler_string) { + opt.handler_string(params, val); + continue; + } + + // arg with 2 values + check_arg(i); + std::string val2 = argv[++i]; + if (opt.handler_str_str) { + opt.handler_str_str(params, val, val2); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(string_format( + "error while handling argument \"%s\": %s\n\n" + "usage:\n%s\n\nto show complete usage, run with -h", + arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); } +} postprocess_cpu_params(params.cpuparams, nullptr); postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); diff --git a/examples/eval-callback/eval-callback.cpp 
b/examples/eval-callback/eval-callback.cpp index fb188f5a9e132..8c0e97f4c2ea8 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -3,17 +3,17 @@ #include "log.h" #include "llama.h" #include "ggml.h" +#include "sampling.h" #include #include #include +#include -/** - * This the arbitrary data which will be passed to each callback. - * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor. - */ struct callback_data { std::vector data; + std::string parse_layer_name; + int current_token_index = -1; }; static std::string ggml_ne_string(const ggml_tensor * t) { @@ -27,89 +27,41 @@ static std::string ggml_ne_string(const ggml_tensor * t) { return str; } -static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { - GGML_ASSERT(n > 0); - float sum = 0; - for (int64_t i3 = 0; i3 < ne[3]; i3++) { - LOG(" [\n"); - for (int64_t i2 = 0; i2 < ne[2]; i2++) { - if (i2 == n && ne[2] > 2*n) { - LOG(" ..., \n"); - i2 = ne[2] - n; - } - LOG(" [\n"); - for (int64_t i1 = 0; i1 < ne[1]; i1++) { - if (i1 == n && ne[1] > 2*n) { - LOG(" ..., \n"); - i1 = ne[1] - n; - } - LOG(" ["); - for (int64_t i0 = 0; i0 < ne[0]; i0++) { - if (i0 == n && ne[0] > 2*n) { - LOG("..., "); - i0 = ne[0] - n; - } - size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; - float v; - if (type == GGML_TYPE_F16) { - v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); - } else if (type == GGML_TYPE_F32) { - v = *(float *) &data[i]; - } else if (type == GGML_TYPE_I32) { - v = (float) *(int32_t *) &data[i]; - } else if (type == GGML_TYPE_I16) { - v = (float) *(int16_t *) &data[i]; - } else if (type == GGML_TYPE_I8) { - v = (float) *(int8_t *) &data[i]; - } else { - GGML_ABORT("fatal error"); - } - LOG("%12.4f", v); - sum += v; - if (i0 < ne[0] - 1) LOG(", "); - } - LOG("],\n"); - } - LOG(" ],\n"); +static void ggml_print_tensor_block(const std::string& tensor_name, uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t token_idx) { + const int64_t dim = ne[0]; + std::cout << "=== TOKEN " << token_idx << " ===\n"; + std::cout << "--- TENSOR: " << tensor_name << " ---\n"; + std::cout << "SHAPE: [" << dim << "]\n"; + std::cout << "DATA:\n"; + + for (int64_t i = 0; i < dim; ++i) { + size_t offset = i * nb[0]; + float v; + + switch (type) { + case GGML_TYPE_F16: v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[offset]); break; + case GGML_TYPE_F32: v = *(float *) &data[offset]; break; + default: GGML_ABORT("Unsupported tensor type"); } - LOG(" ]\n"); - LOG(" sum = %f\n", sum); + + std::cout << v; + if (i < dim - 1) std::cout << ", "; } + + std::cout << "\n\n"; } -/** - * GGML operations callback during the graph execution. - * - * @param t current tensor - * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor - * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. 
- * see ggml_backend_sched_eval_callback - * @param user_data user data to pass at each call back - * @return true to receive data or continue the graph, false otherwise - */ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { auto * cb_data = (callback_data *) user_data; - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - if (ask) { - return true; // Always retrieve data + return std::string(t->name) == cb_data->parse_layer_name; } - char src1_str[128] = {0}; - if (src1) { - snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); + if (std::string(t->name) != cb_data->parse_layer_name) { + return false; } - LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_desc(t), - src0->name, ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); - - - // copy the data from the GPU memory if needed const bool is_host = ggml_backend_buffer_is_host(t->buffer); if (!is_host) { @@ -120,51 +72,122 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { if (!ggml_is_quantized(t->type)) { uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); - ggml_print_tensor(data, t->type, t->ne, t->nb, 3); + ggml_print_tensor_block(t->name, data, t->type, t->ne, t->nb, cb_data->current_token_index); } return true; } -static bool run(llama_context * ctx, const common_params & params) { +static bool run(llama_context * ctx, const common_params & params, callback_data & cb_data) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - const bool add_bos = llama_vocab_get_add_bos(vocab); std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); - if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { - LOG_ERR("%s : failed to eval\n", __func__); + auto sparams = llama_sampler_chain_default_params(); + sparams.no_perf = false; + llama_sampler * sampler = llama_sampler_chain_init(sparams); + llama_sampler_chain_add(sampler, llama_sampler_init_greedy()); + + llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size()); + cb_data.current_token_index = -1; + if (llama_decode(ctx, batch)) { + LOG_ERR("Failed to evaluate prompt\n"); + llama_sampler_free(sampler); return false; } + std::string result; + llama_token token; + + for (int i = 0; i < params.n_predict; ++i) { + token = llama_sampler_sample(sampler, ctx, -1); + if (llama_vocab_is_eog(vocab, token)) { + break; + } + + char buf[128]; + int n = llama_token_to_piece(vocab, token, buf, sizeof(buf), 0, true); + if (n < 0) { + LOG_ERR("Failed to convert token to string\n"); + llama_sampler_free(sampler); + return false; + } + result += std::string(buf, n); // <-- store instead of printing + + llama_batch new_batch = llama_batch_get_one(&token, 1); + cb_data.current_token_index = i; + if (llama_decode(ctx, new_batch)) { + LOG_ERR("Failed to decode sampled token\n"); + llama_sampler_free(sampler); + return false; + } + } + + llama_sampler_free(sampler); + + // Output final result + std::cout << "\n\nFull output:\n" << result << "\n"; + return true; } -int main(int argc, char ** argv) { - callback_data cb_data; +int main(int argc, char **argv) { + callback_data cb_data; common_params params; + std::string parse_layer_value; + std::vector filtered_argv; + std::vector prompts; + + filtered_argv.push_back(argv[0]); + + for (int i = 1; i < argc; i++) { + std::string arg = 
argv[i]; + if (arg.compare(0, 2, "--") == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (arg == "--parse-layer") { + if (i + 1 < argc) { + parse_layer_value = argv[++i]; + } else { + fprintf(stderr, "error: --parse-layer requires an argument\n"); + return 1; + } + continue; + } else if (arg == "--prompt") { + if (i + 1 < argc) { + prompts.emplace_back(argv[++i]); + } else { + fprintf(stderr, "error: --prompt requires an argument\n"); + return 1; + } + continue; + } + + filtered_argv.push_back(argv[i]); + } + + if (!common_params_parse((int)filtered_argv.size(), filtered_argv.data(), params, LLAMA_EXAMPLE_COMMON)) { return 1; } - common_init(); + if (!parse_layer_value.empty()) { + LOG_INF("Parse layer argument value: %s\n", parse_layer_value.c_str()); + } + cb_data.parse_layer_name = parse_layer_value; + common_init(); llama_backend_init(); llama_numa_init(params.numa); - // pass the callback to the backend scheduler - // it will be executed for each node during the graph computation params.cb_eval = ggml_debug; params.cb_eval_user_data = &cb_data; params.warmup = false; - // init common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); @@ -173,16 +196,22 @@ int main(int argc, char ** argv) { return 1; } - // print system information - { - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - LOG_INF("\n"); + LOG_INF("\n"); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + LOG_INF("\n"); + + if (prompts.empty()) { + prompts.emplace_back("What is the capital of France?"); // Fallback default } - bool OK = run(ctx, params); - if (!OK) { - return 1; + + for (const auto& prompt : prompts) { + LOG_INF("Running prompt: %s\n", prompt.c_str()); + params.prompt = prompt; + if (!run(ctx, params, cb_data)) { + LOG_ERR("Failed on prompt: %s\n", prompt.c_str()); + return 1; + } } LOG("\n"); From d512b8b905d4f39f3f251d97a38bafed3d56aad1 Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Mon, 30 Jun 2025 13:41:50 +0200 Subject: [PATCH 02/10] Chore: adjust readme --- examples/eval-callback/README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md index 63a57ad6b68e5..f9065cc368d3d 100644 --- a/examples/eval-callback/README.md +++ b/examples/eval-callback/README.md @@ -6,13 +6,16 @@ It simply prints to the console all operations and tensor data. Usage: ```shell -llama-eval-callback \ - --hf-repo ggml-org/models \ - --hf-file phi-2/ggml-model-q4_0.gguf \ - --model phi-2-q4_0.gguf \ - --prompt hello \ - --seed 42 \ - -ngl 33 +llama-eval-callback \ +--model path/to/model.gguf \ +--parse-layer l_out-31 \ +--n-predict 200 \ +--prompt "What is the capital of France?" 
\ +--prompt "Explain black holes" \ +--prompt "Give me a joke" \ +--seed 42 \ +-ngl 33 > output.txt + ``` Will print: From 9113f6b1cebf9b2088c05e6b2a70a7c012b861ea Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Fri, 4 Jul 2025 17:21:02 +0200 Subject: [PATCH 03/10] Feature: method to list possible layers to parse, set max number of layers to offload to gpu --- examples/eval-callback/eval-callback.cpp | 37 ++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 8c0e97f4c2ea8..c5f6d93ab7741 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -55,6 +55,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { auto * cb_data = (callback_data *) user_data; if (ask) { + if (cb_data->parse_layer_name == "__LIST__") { + std::cout << t->name << "\n"; + return false; + } return std::string(t->name) == cb_data->parse_layer_name; } @@ -137,11 +141,15 @@ static bool run(llama_context * ctx, const common_params & params, callback_data int main(int argc, char **argv) { callback_data cb_data; common_params params; + bool list_layers = false; + std::string list_layers_filter = ""; std::string parse_layer_value; std::vector filtered_argv; std::vector prompts; filtered_argv.push_back(argv[0]); + params.n_gpu_layers = 20; + for (int i = 1; i < argc; i++) { std::string arg = argv[i]; @@ -165,11 +173,27 @@ int main(int argc, char **argv) { return 1; } continue; + } else if (arg == "--n-gpu-layers") { + if (i + 1 < argc) { + params.n_gpu_layers = std::stoi(argv[++i]); // override default + } else { + fprintf(stderr, "error: --n-gpu-layers requires an integer argument\n"); + return 1; + } + continue; + } + else if (arg == "--list-layers") { + list_layers = true; + if (i + 1 < argc && argv[i + 1][0] != '-') { + list_layers_filter = argv[++i]; // take optional argument + } + continue; } filtered_argv.push_back(argv[i]); } + if (!common_params_parse((int)filtered_argv.size(), filtered_argv.data(), params, LLAMA_EXAMPLE_COMMON)) { return 1; } @@ -187,6 +211,7 @@ int main(int argc, char **argv) { params.cb_eval_user_data = &cb_data; params.warmup = false; + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); @@ -204,6 +229,18 @@ int main(int argc, char **argv) { prompts.emplace_back("What is the capital of France?"); // Fallback default } + if (list_layers) { + cb_data.parse_layer_name = "__LIST__"; + params.n_predict = 1; + params.prompt = "dummy"; // any valid prompt to trigger eval + + if (!run(ctx, params, cb_data)) { + LOG_ERR("Failed during layer listing run\n"); + return 1; + } + return 0; + + } for (const auto& prompt : prompts) { LOG_INF("Running prompt: %s\n", prompt.c_str()); From 791fa52ee63f8502c73a377aeed1a648c3f64363 Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Sat, 5 Jul 2025 11:28:26 +0200 Subject: [PATCH 04/10] Fix: add include for ubuntu --- examples/eval-callback/eval-callback.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index c5f6d93ab7741..15f90e1005dd8 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -4,6 +4,7 @@ #include "llama.h" #include "ggml.h" #include "sampling.h" +#include #include #include From 25b0313fec3add3ab62c556afc94f71fda94e01c Mon Sep 17
00:00:00 2001 From: "casper.dert" Date: Sat, 5 Jul 2025 13:05:39 +0200 Subject: [PATCH 05/10] Fix: save tensors and prompt/output to different files --- examples/eval-callback/eval-callback.cpp | 39 ++++++++++++++++++------ 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 15f90e1005dd8..6c805aef217e7 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -11,6 +11,12 @@ #include #include +#include + +std::ofstream prompt_output_file; +std::ofstream tensor_output_file; + + struct callback_data { std::vector data; std::string parse_layer_name; @@ -30,10 +36,10 @@ static std::string ggml_ne_string(const ggml_tensor * t) { static void ggml_print_tensor_block(const std::string& tensor_name, uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t token_idx) { const int64_t dim = ne[0]; - std::cout << "=== TOKEN " << token_idx << " ===\n"; - std::cout << "--- TENSOR: " << tensor_name << " ---\n"; - std::cout << "SHAPE: [" << dim << "]\n"; - std::cout << "DATA:\n"; + tensor_output_file << "=== TOKEN " << token_idx << " ===\n"; + tensor_output_file << "--- TENSOR: " << tensor_name << " ---\n"; + tensor_output_file << "SHAPE: [" << dim << "]\n"; + tensor_output_file << "DATA:\n"; for (int64_t i = 0; i < dim; ++i) { size_t offset = i * nb[0]; @@ -45,11 +51,11 @@ static void ggml_print_tensor_block(const std::string& tensor_name, uint8_t * da default: GGML_ABORT("Unsupported tensor type"); } - std::cout << v; - if (i < dim - 1) std::cout << ", "; + tensor_output_file << v; + if (i < dim - 1) tensor_output_file << ", "; } - std::cout << "\n\n"; + tensor_output_file << "\n\n"; } static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { @@ -57,7 +63,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { if (ask) { if (cb_data->parse_layer_name == "__LIST__") { - std::cout << t->name << "\n"; + tensor_output_file << t->name << "\n"; return false; } return std::string(t->name) == cb_data->parse_layer_name; @@ -133,13 +139,21 @@ static bool run(llama_context * ctx, const common_params & params, callback_data llama_sampler_free(sampler); // Output final result - std::cout << "\n\nFull output:\n" << result << "\n"; + prompt_output_file << "\n\nFull output:\n" << result << "\n"; return true; } int main(int argc, char **argv) { + prompt_output_file.open("prompt_output.txt"); + tensor_output_file.open("tensor_output.txt"); + + if (!prompt_output_file || !tensor_output_file) { + std::cerr << "❌ Failed to open output files.\n"; + return 1; + } + callback_data cb_data; common_params params; bool list_layers = false; @@ -239,12 +253,15 @@ int main(int argc, char **argv) { LOG_ERR("Failed during layer listing run\n"); return 1; } + prompt_output_file.close(); + tensor_output_file.close(); + return 0; } for (const auto& prompt : prompts) { - LOG_INF("Running prompt: %s\n", prompt.c_str()); + prompt_output_file << "Running prompt: " << prompt << "\n"; params.prompt = prompt; if (!run(ctx, params, cb_data)) { LOG_ERR("Failed on prompt: %s\n", prompt.c_str()); @@ -256,6 +273,8 @@ int main(int argc, char **argv) { llama_perf_context_print(ctx); llama_backend_free(); + prompt_output_file.close(); + tensor_output_file.close(); return 0; } From 4864d6b6286addcdbd866203c12c7df6f8ed7015 Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Mon, 28 Jul 2025 21:33:21 +0200 Subject: [PATCH 06/10] Fix: ensure unique 
tensor and output path when running multiple instances --- examples/eval-callback/eval-callback.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 6c805aef217e7..37b8d821599ff 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -146,8 +146,11 @@ static bool run(llama_context * ctx, const common_params & params, callback_data int main(int argc, char **argv) { - prompt_output_file.open("prompt_output.txt"); - tensor_output_file.open("tensor_output.txt"); + std::string output_prefix = "default"; + + prompt_output_file.open(output_prefix + "_prompt_output.txt"); + tensor_output_file.open(output_prefix + "_tensor_output.txt"); + if (!prompt_output_file || !tensor_output_file) { std::cerr << "❌ Failed to open output files.\n"; @@ -180,6 +183,7 @@ int main(int argc, char **argv) { return 1; } continue; + } else if (arg == "--prompt") { if (i + 1 < argc) { prompts.emplace_back(argv[++i]); @@ -188,6 +192,16 @@ int main(int argc, char **argv) { return 1; } continue; + + } else if (arg == "--output-prefix") { + if (i + 1 < argc) { + output_prefix = argv[++i]; + } else { + fprintf(stderr, "error: --output-prefix requires a string argument\n"); + return 1; + } + continue; + } else if (arg == "--n-gpu-layers") { if (i + 1 < argc) { params.n_gpu_layers = std::stoi(argv[++i]); // override default @@ -196,8 +210,8 @@ int main(int argc, char **argv) { return 1; } continue; - } - else if (arg == "--list-layers") { + + } else if (arg == "--list-layers") { list_layers = true; if (i + 1 < argc && argv[i + 1][0] != '-') { list_layers_filter = argv[++i]; // take optional argument @@ -209,6 +223,7 @@ int main(int argc, char **argv) { } + if (!common_params_parse((int)filtered_argv.size(), filtered_argv.data(), params, LLAMA_EXAMPLE_COMMON)) { return 1; } From ecc230bfcfb205efbc016aa363575753e9bc2eee Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Mon, 28 Jul 2025 23:02:42 +0200 Subject: [PATCH 07/10] Fix: create files with unique names --- examples/eval-callback/eval-callback.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 37b8d821599ff..e3795afa4acb9 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -148,9 +148,6 @@ static bool run(llama_context * ctx, const common_params & params, callback_data int main(int argc, char **argv) { std::string output_prefix = "default"; - prompt_output_file.open(output_prefix + "_prompt_output.txt"); - tensor_output_file.open(output_prefix + "_tensor_output.txt"); - if (!prompt_output_file || !tensor_output_file) { std::cerr << "❌ Failed to open output files.\n"; @@ -222,7 +219,13 @@ int main(int argc, char **argv) { filtered_argv.push_back(argv[i]); } + prompt_output_file.open(output_prefix + "_prompt_output.txt"); + tensor_output_file.open(output_prefix + "_tensor_output.txt"); + if (!prompt_output_file || !tensor_output_file) { + std::cerr << "❌ Failed to open output files.\n"; + return 1; + } if (!common_params_parse((int)filtered_argv.size(), filtered_argv.data(), params, LLAMA_EXAMPLE_COMMON)) { return 1; From 1bb0c35563527e697b2cf2742022420928cfbb99 Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Wed, 30 Jul 2025 10:44:31 +0200 Subject: [PATCH 08/10] Fix: create files check later --- 
examples/eval-callback/eval-callback.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index e3795afa4acb9..848d948415a0c 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -148,12 +148,6 @@ static bool run(llama_context * ctx, const common_params & params, callback_data int main(int argc, char **argv) { std::string output_prefix = "default"; - - if (!prompt_output_file || !tensor_output_file) { - std::cerr << "❌ Failed to open output files.\n"; - return 1; - } - callback_data cb_data; common_params params; bool list_layers = false; From 6f316002899d867d18931b17a2fd6284f40757d9 Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Sat, 6 Sep 2025 07:28:03 +0200 Subject: [PATCH 09/10] Feature: save data from multiple layers during inference --- examples/eval-callback/eval-callback.cpp | 172 ++++++++++++++++++----- 1 file changed, 134 insertions(+), 38 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 848d948415a0c..07578feb0edc9 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -13,16 +13,64 @@ #include +#include +#include +#include +#include + std::ofstream prompt_output_file; std::ofstream tensor_output_file; +// sanitize names like "blk.0.output" -> "blk_0_output" +static std::string sanitize(const std::string &s) { + std::string out = s; + for (char &c : out) { + if (c == '/' || c == '\\' || c == ' ' || c == ':' || c == '.' ) c = '_'; + } + return out; +} struct callback_data { std::vector data; - std::string parse_layer_name; - int current_token_index = -1; + + std::unordered_set exact_targets; + std::vector prefix_targets; + + int current_token_index = -1; + bool list_mode = false; + + // NEW: per-tensor streams + base directory + std::string base_dir; // e.g., "/tensors" + std::unordered_map> streams; }; +static bool matches_target(const std::string &name, const callback_data *cb) { + if (cb->exact_targets.find(name) != cb->exact_targets.end()) return true; + for (const auto &pref : cb->prefix_targets) { + if (name.rfind(pref, 0) == 0) return true; // starts_with + } + return false; +} + + +static std::ostream & get_stream_for(const std::string &name, callback_data *cb) { + auto it = cb->streams.find(name); + if (it != cb->streams.end()) return *it->second; + + const std::string fname = cb->base_dir + "/" + sanitize(name) + ".txt"; + auto ofs = std::make_unique(fname, std::ios::app); + if (!ofs->is_open()) { + // fall back to global file if something goes wrong + return tensor_output_file; + } + std::ostream &ref = *ofs; + cb->streams.emplace(name, std::move(ofs)); + return ref; +} + + + + static std::string ggml_ne_string(const ggml_tensor * t) { std::string str; for (int i = 0; i < GGML_MAX_DIMS; ++i) { @@ -34,12 +82,17 @@ static std::string ggml_ne_string(const ggml_tensor * t) { return str; } -static void ggml_print_tensor_block(const std::string& tensor_name, uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t token_idx) { +static void ggml_print_tensor_block(std::ostream &os, + const std::string& tensor_name, + uint8_t * data, ggml_type type, + const int64_t * ne, const size_t * nb, + int64_t token_idx) { const int64_t dim = ne[0]; - tensor_output_file << "=== TOKEN " << token_idx << " ===\n"; - tensor_output_file << "--- TENSOR: " << tensor_name << " ---\n"; - tensor_output_file << "SHAPE: [" 
<< dim << "]\n"; - tensor_output_file << "DATA:\n"; + + os << "=== TOKEN " << token_idx << " ===\n"; + os << "--- TENSOR: " << tensor_name << " ---\n"; + os << "SHAPE: [" << dim << "]\n"; + os << "DATA:\n"; for (int64_t i = 0; i < dim; ++i) { size_t offset = i * nb[0]; @@ -51,44 +104,52 @@ static void ggml_print_tensor_block(const std::string& tensor_name, uint8_t * da default: GGML_ABORT("Unsupported tensor type"); } - tensor_output_file << v; - if (i < dim - 1) tensor_output_file << ", "; + os << v; + if (i < dim - 1) os << ", "; } - - tensor_output_file << "\n\n"; + os << "\n\n"; } static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { - auto * cb_data = (callback_data *) user_data; + auto * cb = (callback_data *) user_data; + const std::string name = t->name; if (ask) { - if (cb_data->parse_layer_name == "__LIST__") { - tensor_output_file << t->name << "\n"; + if (cb->list_mode) { + // print once per tensor name, return false so we don't hook/copy data + static std::unordered_set printed; + if (printed.insert(name).second) { + tensor_output_file << name << "\n"; + } return false; } - return std::string(t->name) == cb_data->parse_layer_name; + // normal (non-list) mode: only hook matches + return matches_target(name, cb); } - if (std::string(t->name) != cb_data->parse_layer_name) { + if (cb->list_mode) { + // we already printed in the ask branch return false; } - const bool is_host = ggml_backend_buffer_is_host(t->buffer); + if (!matches_target(name, cb)) return false; + const bool is_host = ggml_backend_buffer_is_host(t->buffer); if (!is_host) { auto n_bytes = ggml_nbytes(t); - cb_data->data.resize(n_bytes); - ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); + cb->data.resize(n_bytes); + ggml_backend_tensor_get(t, cb->data.data(), 0, n_bytes); } - if (!ggml_is_quantized(t->type)) { - uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); - ggml_print_tensor_block(t->name, data, t->type, t->ne, t->nb, cb_data->current_token_index); + uint8_t * data = is_host ? 
(uint8_t *) t->data : cb->data.data(); + std::ostream &os = get_stream_for(name, cb); + ggml_print_tensor_block(os, name, data, t->type, t->ne, t->nb, cb->current_token_index); + os.flush(); } - return true; } + static bool run(llama_context * ctx, const common_params & params, callback_data & cb_data) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -152,14 +213,14 @@ int main(int argc, char **argv) { common_params params; bool list_layers = false; std::string list_layers_filter = ""; - std::string parse_layer_value; + std::vector parse_layer_values; // multi or comma-separated std::vector filtered_argv; std::vector prompts; filtered_argv.push_back(argv[0]); params.n_gpu_layers = 20; - + // --------- ARG PARSING --------- for (int i = 1; i < argc; i++) { std::string arg = argv[i]; if (arg.compare(0, 2, "--") == 0) { @@ -168,7 +229,16 @@ int main(int argc, char **argv) { if (arg == "--parse-layer") { if (i + 1 < argc) { - parse_layer_value = argv[++i]; + std::string raw = argv[++i]; + // allow comma-separated list + size_t start = 0; + while (true) { + size_t pos = raw.find(',', start); + std::string item = raw.substr(start, pos - start); + if (!item.empty()) parse_layer_values.push_back(item); + if (pos == std::string::npos) break; + start = pos + 1; + } } else { fprintf(stderr, "error: --parse-layer requires an argument\n"); return 1; @@ -205,7 +275,7 @@ int main(int argc, char **argv) { } else if (arg == "--list-layers") { list_layers = true; if (i + 1 < argc && argv[i + 1][0] != '-') { - list_layers_filter = argv[++i]; // take optional argument + list_layers_filter = argv[++i]; // optional filter (unused below) } continue; } @@ -213,22 +283,46 @@ int main(int argc, char **argv) { filtered_argv.push_back(argv[i]); } + // open standard outputs prompt_output_file.open(output_prefix + "_prompt_output.txt"); tensor_output_file.open(output_prefix + "_tensor_output.txt"); - if (!prompt_output_file || !tensor_output_file) { std::cerr << "❌ Failed to open output files.\n"; return 1; } + // create tensors dir AFTER we know output_prefix + try { + std::filesystem::create_directories(output_prefix + "/tensors"); + } catch (const std::exception &e) { + std::cerr << "❌ Failed to create tensors directory: " << e.what() << "\n"; + return 1; + } + cb_data.base_dir = output_prefix + "/tensors"; + if (!common_params_parse((int)filtered_argv.size(), filtered_argv.data(), params, LLAMA_EXAMPLE_COMMON)) { return 1; } - if (!parse_layer_value.empty()) { - LOG_INF("Parse layer argument value: %s\n", parse_layer_value.c_str()); + // configure selector sets + if (list_layers) { + cb_data.list_mode = true; + } else { + if (parse_layer_values.empty()) { + // sensible default (keeps legacy behavior) + cb_data.exact_targets.insert("l_out-31"); + } else { + for (auto s : parse_layer_values) { + if (s == "__LIST__") { cb_data.list_mode = true; continue; } + if (!s.empty() && s.back() == '*') { + s.pop_back(); // treat trailing * as prefix + if (!s.empty()) cb_data.prefix_targets.push_back(s); + } else { + cb_data.exact_targets.insert(s); + } + } + } } - cb_data.parse_layer_name = parse_layer_value; common_init(); llama_backend_init(); @@ -238,7 +332,6 @@ int main(int argc, char **argv) { params.cb_eval_user_data = &cb_data; params.warmup = false; - common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); @@ -253,11 +346,10 @@ int main(int argc, char 
**argv) { LOG_INF("\n"); if (prompts.empty()) { - prompts.emplace_back("What is the capital of France?"); // Fallback default + prompts.emplace_back("What is the capital of France?"); // fallback } - if (list_layers) { - cb_data.parse_layer_name = "__LIST__"; + if (cb_data.list_mode) { params.n_predict = 1; params.prompt = "dummy"; // any valid prompt to trigger eval @@ -267,9 +359,11 @@ int main(int argc, char **argv) { } prompt_output_file.close(); tensor_output_file.close(); - + // close any opened per-tensor streams + for (auto &kv : cb_data.streams) { + if (kv.second && kv.second->is_open()) kv.second->close(); + } return 0; - } for (const auto& prompt : prompts) { @@ -287,6 +381,8 @@ int main(int argc, char **argv) { llama_backend_free(); prompt_output_file.close(); tensor_output_file.close(); - + for (auto &kv : cb_data.streams) { + if (kv.second && kv.second->is_open()) kv.second->close(); + } return 0; } From 11a750518ac7905c188dc88a56587031c2c0b7ac Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Sat, 6 Sep 2025 10:58:39 +0200 Subject: [PATCH 10/10] Feature: save data from multiple layers during inference --- examples/eval-callback/eval-callback.cpp | 205 ++++++++++++++++------- 1 file changed, 141 insertions(+), 64 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 07578feb0edc9..a402baeb357f1 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -10,6 +10,7 @@ #include #include #include +#include "ggml-backend.h" #include @@ -44,6 +45,12 @@ struct callback_data { std::unordered_map> streams; }; +struct sampling_cfg { + int top_k = -1; // <1 = disabled + float top_p = 1.0f; // >=1 = disabled + float temp = 1.0f; // we will always apply temperature (min clamp) +}; + static bool matches_target(const std::string &name, const callback_data *cb) { if (cb->exact_targets.find(name) != cb->exact_targets.end()) return true; for (const auto &pref : cb->prefix_targets) { @@ -150,17 +157,38 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { } -static bool run(llama_context * ctx, const common_params & params, callback_data & cb_data) { +static bool run(llama_context * ctx, + const common_params & params, + const sampling_cfg & samp, + callback_data & cb_data){ const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); const bool add_bos = llama_vocab_get_add_bos(vocab); std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); - auto sparams = llama_sampler_chain_default_params(); - sparams.no_perf = false; - llama_sampler * sampler = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(sampler, llama_sampler_init_greedy()); + auto chain_params = llama_sampler_chain_default_params(); + chain_params.no_perf = false; + llama_sampler * sampler = llama_sampler_chain_init(chain_params); + + // Always apply provided temperature (clamped to >0 above) + llama_sampler_chain_add(sampler, llama_sampler_init_temp(samp.temp)); + + // Optional: top-k + if (samp.top_k > 0) { + llama_sampler_chain_add(sampler, llama_sampler_init_top_k(samp.top_k)); + } + + // Optional: top-p + if (samp.top_p < 1.0f) { + // min_keep = 1 is sane + llama_sampler_chain_add(sampler, llama_sampler_init_top_p(samp.top_p, 1)); + } + + // Add RNG distribution so temp/top-k/top-p actually randomize + uint32_t seed = (uint32_t) ggml_time_us(); + llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed)); + llama_batch 
batch = llama_batch_get_one(tokens.data(), tokens.size()); cb_data.current_token_index = -1; @@ -210,6 +238,8 @@ int main(int argc, char **argv) { std::string output_prefix = "default"; callback_data cb_data; + sampling_cfg samp; // <-- add this + common_params params; bool list_layers = false; std::string list_layers_filter = ""; @@ -220,69 +250,116 @@ int main(int argc, char **argv) { filtered_argv.push_back(argv[0]); params.n_gpu_layers = 20; - // --------- ARG PARSING --------- - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - if (arg.compare(0, 2, "--") == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } +// --------- ARG PARSING --------- +for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg.compare(0, 2, "--") == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } - if (arg == "--parse-layer") { - if (i + 1 < argc) { - std::string raw = argv[++i]; - // allow comma-separated list - size_t start = 0; - while (true) { - size_t pos = raw.find(',', start); - std::string item = raw.substr(start, pos - start); - if (!item.empty()) parse_layer_values.push_back(item); - if (pos == std::string::npos) break; - start = pos + 1; - } - } else { - fprintf(stderr, "error: --parse-layer requires an argument\n"); - return 1; - } - continue; - - } else if (arg == "--prompt") { - if (i + 1 < argc) { - prompts.emplace_back(argv[++i]); - } else { - fprintf(stderr, "error: --prompt requires an argument\n"); - return 1; + // --parse-layer + if (arg == "--parse-layer") { + if (i + 1 < argc) { + std::string raw = argv[++i]; + size_t start = 0; + while (true) { + size_t pos = raw.find(',', start); + std::string item = raw.substr(start, pos - start); + if (!item.empty()) parse_layer_values.push_back(item); + if (pos == std::string::npos) break; + start = pos + 1; } - continue; - - } else if (arg == "--output-prefix") { - if (i + 1 < argc) { - output_prefix = argv[++i]; - } else { - fprintf(stderr, "error: --output-prefix requires a string argument\n"); - return 1; - } - continue; - - } else if (arg == "--n-gpu-layers") { - if (i + 1 < argc) { - params.n_gpu_layers = std::stoi(argv[++i]); // override default - } else { - fprintf(stderr, "error: --n-gpu-layers requires an integer argument\n"); - return 1; - } - continue; + } else { + fprintf(stderr, "error: --parse-layer requires an argument\n"); + return 1; + } + continue; + } - } else if (arg == "--list-layers") { - list_layers = true; - if (i + 1 < argc && argv[i + 1][0] != '-') { - list_layers_filter = argv[++i]; // optional filter (unused below) - } - continue; + // --prompt "..." 
+ if (arg == "--prompt") { + if (i + 1 < argc) { + prompts.emplace_back(argv[++i]); + } else { + fprintf(stderr, "error: --prompt requires an argument\n"); + return 1; + } + continue; + } + + // --top-k N + if (arg == "--top-k") { + if (i + 1 < argc) { + samp.top_k = std::stoi(argv[++i]); + if (samp.top_k < 1) samp.top_k = -1; // disable if <1 + } else { + fprintf(stderr, "error: --top-k requires an int\n"); + return 1; } + continue; + } + + // --top-p F + if (arg == "--top-p") { + if (i + 1 < argc) { + samp.top_p = std::stof(argv[++i]); + if (samp.top_p <= 0.0f) samp.top_p = 1.0f; + if (samp.top_p > 1.0f) samp.top_p = 1.0f; // clamp + } else { + fprintf(stderr, "error: --top-p requires a float\n"); + return 1; + } + continue; + } - filtered_argv.push_back(argv[i]); + // --temp F (or --temperature F) + if (arg == "--temp" || arg == "--temperature") { + if (i + 1 < argc) { + samp.temp = std::stof(argv[++i]); + if (samp.temp <= 0.0f) samp.temp = 1e-6f; // avoid greedy (force >0) + } else { + fprintf(stderr, "error: --temperature requires a float\n"); + return 1; + } + continue; } + // --output-prefix STR + if (arg == "--output-prefix") { + if (i + 1 < argc) { + output_prefix = argv[++i]; + } else { + fprintf(stderr, "error: --output-prefix requires a string argument\n"); + return 1; + } + continue; + } + + // --n-gpu-layers N + if (arg == "--n-gpu-layers") { + if (i + 1 < argc) { + params.n_gpu_layers = std::stoi(argv[++i]); + } else { + fprintf(stderr, "error: --n-gpu-layers requires an integer argument\n"); + return 1; + } + continue; + } + + // --list-layers [optional_filter] + if (arg == "--list-layers") { + list_layers = true; + if (i + 1 < argc && argv[i + 1][0] != '-') { + list_layers_filter = argv[++i]; // optional, currently unused + } + continue; + } + + // Unrecognized flag/arg: pass through to common_params_parse + filtered_argv.push_back(argv[i]); +} + + // open standard outputs prompt_output_file.open(output_prefix + "_prompt_output.txt"); tensor_output_file.open(output_prefix + "_tensor_output.txt"); @@ -353,7 +430,7 @@ int main(int argc, char **argv) { params.n_predict = 1; params.prompt = "dummy"; // any valid prompt to trigger eval - if (!run(ctx, params, cb_data)) { + if (!run(ctx, params, samp, cb_data)) { LOG_ERR("Failed during layer listing run\n"); return 1; } @@ -369,13 +446,13 @@ int main(int argc, char **argv) { for (const auto& prompt : prompts) { prompt_output_file << "Running prompt: " << prompt << "\n"; params.prompt = prompt; - if (!run(ctx, params, cb_data)) { + if (!run(ctx, params, samp, cb_data)) { LOG_ERR("Failed on prompt: %s\n", prompt.c_str()); return 1; } } - LOG("\n"); + LOG_INF("\n"); llama_perf_context_print(ctx); llama_backend_free();
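
Note on usage: the README example added in [PATCH 02/10] predates the later commits in this series. By [PATCH 10/10] the example accepts multiple `--parse-layer` targets (comma-separated; a trailing `*` is treated as a name prefix), plus `--list-layers`, `--output-prefix`, `--n-gpu-layers`, and the `--top-k` / `--top-p` / `--temp` sampling flags, and it writes per-tensor dumps under `<output-prefix>/tensors/`. A minimal invocation sketch under those assumptions — the model path and the exact layer names below are placeholders, not taken from the patches:

```shell
llama-eval-callback \
  --model path/to/model.gguf \
  --parse-layer "l_out-31,blk.0.*" \
  --output-prefix run1 \
  --n-gpu-layers 20 \
  --n-predict 64 \
  --top-k 40 --top-p 0.9 --temp 0.8 \
  --prompt "What is the capital of France?" \
  --prompt "Explain black holes"
```

With these flags the generated text for each prompt goes to `run1_prompt_output.txt`, and every tensor whose name matches a target is appended to `run1/tensors/<sanitized-name>.txt`, one block per decode step.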