From de30cb6a7a4d37a00c5507263f3dd8e6783aee29 Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Mon, 30 Jun 2025 13:35:55 +0200 Subject: [PATCH 01/10] Chore: batch prompts, extract tensors specific layer --- common/arg.cpp | 91 ++++++---- examples/eval-callback/eval-callback.cpp | 221 +++++++++++++---------- 2 files changed, 176 insertions(+), 136 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0d0daa3610105..74559d28a4474 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -891,51 +891,62 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context }; for (int i = 1; i < argc; i++) { - const std::string arg_prefix = "--"; + const std::string arg_prefix = "--"; - std::string arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - if (arg_to_options.find(arg) == arg_to_options.end()) { - throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); - } - auto opt = *arg_to_options[arg]; - if (opt.has_value_from_env()) { - fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); + std::string arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + // Skip --parse-layer and its value(s) + if (arg == "--parse-layer") { + // Assuming --parse-layer takes exactly 1 argument + if (i + 1 < argc) { + i++; // skip the next value as well } - try { - if (opt.handler_void) { - opt.handler_void(params); - continue; - } + continue; + } - // arg with single value - check_arg(i); - std::string val = argv[++i]; - if (opt.handler_int) { - opt.handler_int(params, std::stoi(val)); - continue; - } - if (opt.handler_string) { - opt.handler_string(params, val); - continue; - } + if (arg_to_options.find(arg) == arg_to_options.end()) { + throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); + } - // arg with 2 values - check_arg(i); - std::string val2 = argv[++i]; - if (opt.handler_str_str) { - opt.handler_str_str(params, val, val2); - continue; - } - } catch (std::exception & e) { - throw std::invalid_argument(string_format( - "error while handling argument \"%s\": %s\n\n" - "usage:\n%s\n\nto show complete usage, run with -h", - arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); - } + auto opt = *arg_to_options[arg]; + if (opt.has_value_from_env()) { + fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); + } + try { + if (opt.handler_void) { + opt.handler_void(params); + continue; + } + + // arg with single value + check_arg(i); + std::string val = argv[++i]; + if (opt.handler_int) { + opt.handler_int(params, std::stoi(val)); + continue; + } + if (opt.handler_string) { + opt.handler_string(params, val); + continue; + } + + // arg with 2 values + check_arg(i); + std::string val2 = argv[++i]; + if (opt.handler_str_str) { + opt.handler_str_str(params, val, val2); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(string_format( + "error while handling argument \"%s\": %s\n\n" + "usage:\n%s\n\nto show complete usage, run with -h", + arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); } +} postprocess_cpu_params(params.cpuparams, nullptr); postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); diff --git a/examples/eval-callback/eval-callback.cpp 
b/examples/eval-callback/eval-callback.cpp index fb188f5a9e132..8c0e97f4c2ea8 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -3,17 +3,17 @@ #include "log.h" #include "llama.h" #include "ggml.h" +#include "sampling.h" #include #include #include +#include -/** - * This the arbitrary data which will be passed to each callback. - * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor. - */ struct callback_data { std::vector data; + std::string parse_layer_name; + int current_token_index = -1; }; static std::string ggml_ne_string(const ggml_tensor * t) { @@ -27,89 +27,41 @@ static std::string ggml_ne_string(const ggml_tensor * t) { return str; } -static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { - GGML_ASSERT(n > 0); - float sum = 0; - for (int64_t i3 = 0; i3 < ne[3]; i3++) { - LOG(" [\n"); - for (int64_t i2 = 0; i2 < ne[2]; i2++) { - if (i2 == n && ne[2] > 2*n) { - LOG(" ..., \n"); - i2 = ne[2] - n; - } - LOG(" [\n"); - for (int64_t i1 = 0; i1 < ne[1]; i1++) { - if (i1 == n && ne[1] > 2*n) { - LOG(" ..., \n"); - i1 = ne[1] - n; - } - LOG(" ["); - for (int64_t i0 = 0; i0 < ne[0]; i0++) { - if (i0 == n && ne[0] > 2*n) { - LOG("..., "); - i0 = ne[0] - n; - } - size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; - float v; - if (type == GGML_TYPE_F16) { - v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); - } else if (type == GGML_TYPE_F32) { - v = *(float *) &data[i]; - } else if (type == GGML_TYPE_I32) { - v = (float) *(int32_t *) &data[i]; - } else if (type == GGML_TYPE_I16) { - v = (float) *(int16_t *) &data[i]; - } else if (type == GGML_TYPE_I8) { - v = (float) *(int8_t *) &data[i]; - } else { - GGML_ABORT("fatal error"); - } - LOG("%12.4f", v); - sum += v; - if (i0 < ne[0] - 1) LOG(", "); - } - LOG("],\n"); - } - LOG(" ],\n"); +static void ggml_print_tensor_block(const std::string& tensor_name, uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t token_idx) { + const int64_t dim = ne[0]; + std::cout << "=== TOKEN " << token_idx << " ===\n"; + std::cout << "--- TENSOR: " << tensor_name << " ---\n"; + std::cout << "SHAPE: [" << dim << "]\n"; + std::cout << "DATA:\n"; + + for (int64_t i = 0; i < dim; ++i) { + size_t offset = i * nb[0]; + float v; + + switch (type) { + case GGML_TYPE_F16: v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[offset]); break; + case GGML_TYPE_F32: v = *(float *) &data[offset]; break; + default: GGML_ABORT("Unsupported tensor type"); } - LOG(" ]\n"); - LOG(" sum = %f\n", sum); + + std::cout << v; + if (i < dim - 1) std::cout << ", "; } + + std::cout << "\n\n"; } -/** - * GGML operations callback during the graph execution. - * - * @param t current tensor - * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor - * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. 
- * see ggml_backend_sched_eval_callback - * @param user_data user data to pass at each call back - * @return true to receive data or continue the graph, false otherwise - */ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { auto * cb_data = (callback_data *) user_data; - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - if (ask) { - return true; // Always retrieve data + return std::string(t->name) == cb_data->parse_layer_name; } - char src1_str[128] = {0}; - if (src1) { - snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); + if (std::string(t->name) != cb_data->parse_layer_name) { + return false; } - LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_desc(t), - src0->name, ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); - - - // copy the data from the GPU memory if needed const bool is_host = ggml_backend_buffer_is_host(t->buffer); if (!is_host) { @@ -120,51 +72,122 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { if (!ggml_is_quantized(t->type)) { uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); - ggml_print_tensor(data, t->type, t->ne, t->nb, 3); + ggml_print_tensor_block(t->name, data, t->type, t->ne, t->nb, cb_data->current_token_index); } return true; } -static bool run(llama_context * ctx, const common_params & params) { +static bool run(llama_context * ctx, const common_params & params, callback_data & cb_data) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - const bool add_bos = llama_vocab_get_add_bos(vocab); std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); - if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { - LOG_ERR("%s : failed to eval\n", __func__); + auto sparams = llama_sampler_chain_default_params(); + sparams.no_perf = false; + llama_sampler * sampler = llama_sampler_chain_init(sparams); + llama_sampler_chain_add(sampler, llama_sampler_init_greedy()); + + llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size()); + cb_data.current_token_index = -1; + if (llama_decode(ctx, batch)) { + LOG_ERR("Failed to evaluate prompt\n"); + llama_sampler_free(sampler); return false; } + std::string result; + llama_token token; + + for (int i = 0; i < params.n_predict; ++i) { + token = llama_sampler_sample(sampler, ctx, -1); + if (llama_vocab_is_eog(vocab, token)) { + break; + } + + char buf[128]; + int n = llama_token_to_piece(vocab, token, buf, sizeof(buf), 0, true); + if (n < 0) { + LOG_ERR("Failed to convert token to string\n"); + llama_sampler_free(sampler); + return false; + } + result += std::string(buf, n); // <-- store instead of printing + + llama_batch new_batch = llama_batch_get_one(&token, 1); + cb_data.current_token_index = i; + if (llama_decode(ctx, new_batch)) { + LOG_ERR("Failed to decode sampled token\n"); + llama_sampler_free(sampler); + return false; + } + } + + llama_sampler_free(sampler); + + // Output final result + std::cout << "\n\nFull output:\n" << result << "\n"; + return true; } -int main(int argc, char ** argv) { - callback_data cb_data; +int main(int argc, char **argv) { + callback_data cb_data; common_params params; + std::string parse_layer_value; + std::vector filtered_argv; + std::vector prompts; + + filtered_argv.push_back(argv[0]); + + for (int i = 1; i < argc; i++) { + std::string arg = 
argv[i]; + if (arg.compare(0, 2, "--") == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (arg == "--parse-layer") { + if (i + 1 < argc) { + parse_layer_value = argv[++i]; + } else { + fprintf(stderr, "error: --parse-layer requires an argument\n"); + return 1; + } + continue; + } else if (arg == "--prompt") { + if (i + 1 < argc) { + prompts.emplace_back(argv[++i]); + } else { + fprintf(stderr, "error: --prompt requires an argument\n"); + return 1; + } + continue; + } + + filtered_argv.push_back(argv[i]); + } + + if (!common_params_parse((int)filtered_argv.size(), filtered_argv.data(), params, LLAMA_EXAMPLE_COMMON)) { return 1; } - common_init(); + if (!parse_layer_value.empty()) { + LOG_INF("Parse layer argument value: %s\n", parse_layer_value.c_str()); + } + cb_data.parse_layer_name = parse_layer_value; + common_init(); llama_backend_init(); llama_numa_init(params.numa); - // pass the callback to the backend scheduler - // it will be executed for each node during the graph computation params.cb_eval = ggml_debug; params.cb_eval_user_data = &cb_data; params.warmup = false; - // init common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); @@ -173,16 +196,22 @@ int main(int argc, char ** argv) { return 1; } - // print system information - { - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - LOG_INF("\n"); + LOG_INF("\n"); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + LOG_INF("\n"); + + if (prompts.empty()) { + prompts.emplace_back("What is the capital of France?"); // Fallback default } - bool OK = run(ctx, params); - if (!OK) { - return 1; + + for (const auto& prompt : prompts) { + LOG_INF("Running prompt: %s\n", prompt.c_str()); + params.prompt = prompt; + if (!run(ctx, params, cb_data)) { + LOG_ERR("Failed on prompt: %s\n", prompt.c_str()); + return 1; + } } LOG("\n"); From d512b8b905d4f39f3f251d97a38bafed3d56aad1 Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Mon, 30 Jun 2025 13:41:50 +0200 Subject: [PATCH 02/10] Chore: adjust readme --- examples/eval-callback/README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md index 63a57ad6b68e5..f9065cc368d3d 100644 --- a/examples/eval-callback/README.md +++ b/examples/eval-callback/README.md @@ -6,13 +6,16 @@ It simply prints to the console all operations and tensor data. Usage: ```shell -llama-eval-callback \ - --hf-repo ggml-org/models \ - --hf-file phi-2/ggml-model-q4_0.gguf \ - --model phi-2-q4_0.gguf \ - --prompt hello \ - --seed 42 \ - -ngl 33 +llama-eval-callback \ +--model path/to/model.gguf \ +--parse-layer l_out-31 \ +--n-predict 200 \ +--prompt "What is the capital of France?" 
\ +--prompt "Explain black holes" \ +--prompt "Give me a joke" \ +--seed 42 \ +-ngl 33 > output.txt + ``` Will print: From 9113f6b1cebf9b2088c05e6b2a70a7c012b861ea Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Fri, 4 Jul 2025 17:21:02 +0200 Subject: [PATCH 03/10] Feature: method to list possible layers to parse, set max number of layers to offload to gpu --- examples/eval-callback/eval-callback.cpp | 37 ++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 8c0e97f4c2ea8..c5f6d93ab7741 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -55,6 +55,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { auto * cb_data = (callback_data *) user_data; if (ask) { + if (cb_data->parse_layer_name == "__LIST__") { + std::cout << t->name << "\n"; + return false; + } return std::string(t->name) == cb_data->parse_layer_name; } @@ -137,11 +141,15 @@ static bool run(llama_context * ctx, const common_params & params, callback_data int main(int argc, char **argv) { callback_data cb_data; common_params params; + bool list_layers = false; + std::string list_layers_filter = ""; std::string parse_layer_value; std::vector filtered_argv; std::vector prompts; filtered_argv.push_back(argv[0]); + params.n_gpu_layers = 20; + for (int i = 1; i < argc; i++) { std::string arg = argv[i]; @@ -165,11 +173,27 @@ int main(int argc, char **argv) { return 1; } continue; + } else if (arg == "--n-gpu-layers") { + if (i + 1 < argc) { + params.n_gpu_layers = std::stoi(argv[++i]); // override default + } else { + fprintf(stderr, "error: --n-gpu-layers requires an integer argument\n"); + return 1; + } + continue; + } + else if (arg == "--list-layers") { + list_layers = true; + if (i + 1 < argc && argv[i + 1][0] != '-') { + list_layers_filter = argv[++i]; // take optional argument + } + continue; } filtered_argv.push_back(argv[i]); } + if (!common_params_parse((int)filtered_argv.size(), filtered_argv.data(), params, LLAMA_EXAMPLE_COMMON)) { return 1; } @@ -187,6 +211,7 @@ int main(int argc, char **argv) { params.cb_eval_user_data = &cb_data; params.warmup = false; + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); @@ -204,6 +229,18 @@ int main(int argc, char **argv) { prompts.emplace_back("What is the capital of France?"); // Fallback default } + if (list_layers) { + cb_data.parse_layer_name = "__LIST__"; + params.n_predict = 1; + params.prompt = "dummy"; // any valid prompt to trigger eval + + if (!run(ctx, params, cb_data)) { + LOG_ERR("Failed during layer listing run\n"); + return 1; + } + return 0; + + } for (const auto& prompt : prompts) { LOG_INF("Running prompt: %s\n", prompt.c_str()); From 791fa52ee63f8502c73a377aeed1a648c3f64363 Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Sat, 5 Jul 2025 11:28:26 +0200 Subject: [PATCH 04/10] Fix: add include for ubuntu --- examples/eval-callback/eval-callback.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index c5f6d93ab7741..15f90e1005dd8 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -4,6 +4,7 @@ #include "llama.h" #include "ggml.h" #include "sampling.h" +#include #include #include From 25b0313fec3add3ab62c556afc94f71fda94e01c Mon Sep 17
00:00:00 2001 From: "casper.dert" Date: Sat, 5 Jul 2025 13:05:39 +0200 Subject: [PATCH 05/10] Fix: save tensors and prompt/output to different files --- examples/eval-callback/eval-callback.cpp | 39 ++++++++++++++++++------ 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 15f90e1005dd8..6c805aef217e7 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -11,6 +11,12 @@ #include #include +#include + +std::ofstream prompt_output_file; +std::ofstream tensor_output_file; + + struct callback_data { std::vector data; std::string parse_layer_name; @@ -30,10 +36,10 @@ static std::string ggml_ne_string(const ggml_tensor * t) { static void ggml_print_tensor_block(const std::string& tensor_name, uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t token_idx) { const int64_t dim = ne[0]; - std::cout << "=== TOKEN " << token_idx << " ===\n"; - std::cout << "--- TENSOR: " << tensor_name << " ---\n"; - std::cout << "SHAPE: [" << dim << "]\n"; - std::cout << "DATA:\n"; + tensor_output_file << "=== TOKEN " << token_idx << " ===\n"; + tensor_output_file << "--- TENSOR: " << tensor_name << " ---\n"; + tensor_output_file << "SHAPE: [" << dim << "]\n"; + tensor_output_file << "DATA:\n"; for (int64_t i = 0; i < dim; ++i) { size_t offset = i * nb[0]; @@ -45,11 +51,11 @@ static void ggml_print_tensor_block(const std::string& tensor_name, uint8_t * da default: GGML_ABORT("Unsupported tensor type"); } - std::cout << v; - if (i < dim - 1) std::cout << ", "; + tensor_output_file << v; + if (i < dim - 1) tensor_output_file << ", "; } - std::cout << "\n\n"; + tensor_output_file << "\n\n"; } static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { @@ -57,7 +63,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { if (ask) { if (cb_data->parse_layer_name == "__LIST__") { - std::cout << t->name << "\n"; + tensor_output_file << t->name << "\n"; return false; } return std::string(t->name) == cb_data->parse_layer_name; @@ -133,13 +139,21 @@ static bool run(llama_context * ctx, const common_params & params, callback_data llama_sampler_free(sampler); // Output final result - std::cout << "\n\nFull output:\n" << result << "\n"; + prompt_output_file << "\n\nFull output:\n" << result << "\n"; return true; } int main(int argc, char **argv) { + prompt_output_file.open("prompt_output.txt"); + tensor_output_file.open("tensor_output.txt"); + + if (!prompt_output_file || !tensor_output_file) { + std::cerr << "❌ Failed to open output files.\n"; + return 1; + } + callback_data cb_data; common_params params; bool list_layers = false; @@ -239,12 +253,15 @@ int main(int argc, char **argv) { LOG_ERR("Failed during layer listing run\n"); return 1; } + prompt_output_file.close(); + tensor_output_file.close(); + return 0; } for (const auto& prompt : prompts) { - LOG_INF("Running prompt: %s\n", prompt.c_str()); + prompt_output_file << "Running prompt: " << prompt << "\n"; params.prompt = prompt; if (!run(ctx, params, cb_data)) { LOG_ERR("Failed on prompt: %s\n", prompt.c_str()); @@ -256,6 +273,8 @@ int main(int argc, char **argv) { llama_perf_context_print(ctx); llama_backend_free(); + prompt_output_file.close(); + tensor_output_file.close(); return 0; } From 4864d6b6286addcdbd866203c12c7df6f8ed7015 Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Mon, 28 Jul 2025 21:33:21 +0200 Subject: [PATCH 06/10] Fix: ensure unique 
tensor and output path when running multiple instances --- examples/eval-callback/eval-callback.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 6c805aef217e7..37b8d821599ff 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -146,8 +146,11 @@ static bool run(llama_context * ctx, const common_params & params, callback_data int main(int argc, char **argv) { - prompt_output_file.open("prompt_output.txt"); - tensor_output_file.open("tensor_output.txt"); + std::string output_prefix = "default"; + + prompt_output_file.open(output_prefix + "_prompt_output.txt"); + tensor_output_file.open(output_prefix + "_tensor_output.txt"); + if (!prompt_output_file || !tensor_output_file) { std::cerr << "❌ Failed to open output files.\n"; @@ -180,6 +183,7 @@ int main(int argc, char **argv) { return 1; } continue; + } else if (arg == "--prompt") { if (i + 1 < argc) { prompts.emplace_back(argv[++i]); @@ -188,6 +192,16 @@ int main(int argc, char **argv) { return 1; } continue; + + } else if (arg == "--output-prefix") { + if (i + 1 < argc) { + output_prefix = argv[++i]; + } else { + fprintf(stderr, "error: --output-prefix requires a string argument\n"); + return 1; + } + continue; + } else if (arg == "--n-gpu-layers") { if (i + 1 < argc) { params.n_gpu_layers = std::stoi(argv[++i]); // override default @@ -196,8 +210,8 @@ int main(int argc, char **argv) { return 1; } continue; - } - else if (arg == "--list-layers") { + + } else if (arg == "--list-layers") { list_layers = true; if (i + 1 < argc && argv[i + 1][0] != '-') { list_layers_filter = argv[++i]; // take optional argument @@ -209,6 +223,7 @@ int main(int argc, char **argv) { } + if (!common_params_parse((int)filtered_argv.size(), filtered_argv.data(), params, LLAMA_EXAMPLE_COMMON)) { return 1; } From ecc230bfcfb205efbc016aa363575753e9bc2eee Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Mon, 28 Jul 2025 23:02:42 +0200 Subject: [PATCH 07/10] Fix: create files with unique names --- examples/eval-callback/eval-callback.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 37b8d821599ff..e3795afa4acb9 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -148,9 +148,6 @@ static bool run(llama_context * ctx, const common_params & params, callback_data int main(int argc, char **argv) { std::string output_prefix = "default"; - prompt_output_file.open(output_prefix + "_prompt_output.txt"); - tensor_output_file.open(output_prefix + "_tensor_output.txt"); - if (!prompt_output_file || !tensor_output_file) { std::cerr << "❌ Failed to open output files.\n"; @@ -222,7 +219,13 @@ int main(int argc, char **argv) { filtered_argv.push_back(argv[i]); } + prompt_output_file.open(output_prefix + "_prompt_output.txt"); + tensor_output_file.open(output_prefix + "_tensor_output.txt"); + if (!prompt_output_file || !tensor_output_file) { + std::cerr << "❌ Failed to open output files.\n"; + return 1; + } if (!common_params_parse((int)filtered_argv.size(), filtered_argv.data(), params, LLAMA_EXAMPLE_COMMON)) { return 1; From 1bb0c35563527e697b2cf2742022420928cfbb99 Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Wed, 30 Jul 2025 10:44:31 +0200 Subject: [PATCH 08/10] Fix: create files check later --- 
examples/eval-callback/eval-callback.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index e3795afa4acb9..848d948415a0c 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -148,12 +148,6 @@ static bool run(llama_context * ctx, const common_params & params, callback_data int main(int argc, char **argv) { std::string output_prefix = "default"; - - if (!prompt_output_file || !tensor_output_file) { - std::cerr << "❌ Failed to open output files.\n"; - return 1; - } - callback_data cb_data; common_params params; bool list_layers = false; From 6f316002899d867d18931b17a2fd6284f40757d9 Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Sat, 6 Sep 2025 07:28:03 +0200 Subject: [PATCH 09/10] Feature: save data from multiple layers during inference --- examples/eval-callback/eval-callback.cpp | 172 ++++++++++++++++++----- 1 file changed, 134 insertions(+), 38 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 848d948415a0c..07578feb0edc9 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -13,16 +13,64 @@ #include +#include +#include +#include +#include + std::ofstream prompt_output_file; std::ofstream tensor_output_file; +// sanitize names like "blk.0.output" -> "blk_0_output" +static std::string sanitize(const std::string &s) { + std::string out = s; + for (char &c : out) { + if (c == '/' || c == '\\' || c == ' ' || c == ':' || c == '.' ) c = '_'; + } + return out; +} struct callback_data { std::vector data; - std::string parse_layer_name; - int current_token_index = -1; + + std::unordered_set exact_targets; + std::vector prefix_targets; + + int current_token_index = -1; + bool list_mode = false; + + // NEW: per-tensor streams + base directory + std::string base_dir; // e.g., "/tensors" + std::unordered_map> streams; }; +static bool matches_target(const std::string &name, const callback_data *cb) { + if (cb->exact_targets.find(name) != cb->exact_targets.end()) return true; + for (const auto &pref : cb->prefix_targets) { + if (name.rfind(pref, 0) == 0) return true; // starts_with + } + return false; +} + + +static std::ostream & get_stream_for(const std::string &name, callback_data *cb) { + auto it = cb->streams.find(name); + if (it != cb->streams.end()) return *it->second; + + const std::string fname = cb->base_dir + "/" + sanitize(name) + ".txt"; + auto ofs = std::make_unique(fname, std::ios::app); + if (!ofs->is_open()) { + // fall back to global file if something goes wrong + return tensor_output_file; + } + std::ostream &ref = *ofs; + cb->streams.emplace(name, std::move(ofs)); + return ref; +} + + + + static std::string ggml_ne_string(const ggml_tensor * t) { std::string str; for (int i = 0; i < GGML_MAX_DIMS; ++i) { @@ -34,12 +82,17 @@ static std::string ggml_ne_string(const ggml_tensor * t) { return str; } -static void ggml_print_tensor_block(const std::string& tensor_name, uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t token_idx) { +static void ggml_print_tensor_block(std::ostream &os, + const std::string& tensor_name, + uint8_t * data, ggml_type type, + const int64_t * ne, const size_t * nb, + int64_t token_idx) { const int64_t dim = ne[0]; - tensor_output_file << "=== TOKEN " << token_idx << " ===\n"; - tensor_output_file << "--- TENSOR: " << tensor_name << " ---\n"; - tensor_output_file << "SHAPE: [" 
<< dim << "]\n"; - tensor_output_file << "DATA:\n"; + + os << "=== TOKEN " << token_idx << " ===\n"; + os << "--- TENSOR: " << tensor_name << " ---\n"; + os << "SHAPE: [" << dim << "]\n"; + os << "DATA:\n"; for (int64_t i = 0; i < dim; ++i) { size_t offset = i * nb[0]; @@ -51,44 +104,52 @@ static void ggml_print_tensor_block(const std::string& tensor_name, uint8_t * da default: GGML_ABORT("Unsupported tensor type"); } - tensor_output_file << v; - if (i < dim - 1) tensor_output_file << ", "; + os << v; + if (i < dim - 1) os << ", "; } - - tensor_output_file << "\n\n"; + os << "\n\n"; } static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { - auto * cb_data = (callback_data *) user_data; + auto * cb = (callback_data *) user_data; + const std::string name = t->name; if (ask) { - if (cb_data->parse_layer_name == "__LIST__") { - tensor_output_file << t->name << "\n"; + if (cb->list_mode) { + // print once per tensor name, return false so we don't hook/copy data + static std::unordered_set printed; + if (printed.insert(name).second) { + tensor_output_file << name << "\n"; + } return false; } - return std::string(t->name) == cb_data->parse_layer_name; + // normal (non-list) mode: only hook matches + return matches_target(name, cb); } - if (std::string(t->name) != cb_data->parse_layer_name) { + if (cb->list_mode) { + // we already printed in the ask branch return false; } - const bool is_host = ggml_backend_buffer_is_host(t->buffer); + if (!matches_target(name, cb)) return false; + const bool is_host = ggml_backend_buffer_is_host(t->buffer); if (!is_host) { auto n_bytes = ggml_nbytes(t); - cb_data->data.resize(n_bytes); - ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); + cb->data.resize(n_bytes); + ggml_backend_tensor_get(t, cb->data.data(), 0, n_bytes); } - if (!ggml_is_quantized(t->type)) { - uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); - ggml_print_tensor_block(t->name, data, t->type, t->ne, t->nb, cb_data->current_token_index); + uint8_t * data = is_host ? 
(uint8_t *) t->data : cb->data.data(); + std::ostream &os = get_stream_for(name, cb); + ggml_print_tensor_block(os, name, data, t->type, t->ne, t->nb, cb->current_token_index); + os.flush(); } - return true; } + static bool run(llama_context * ctx, const common_params & params, callback_data & cb_data) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -152,14 +213,14 @@ int main(int argc, char **argv) { common_params params; bool list_layers = false; std::string list_layers_filter = ""; - std::string parse_layer_value; + std::vector parse_layer_values; // multi or comma-separated std::vector filtered_argv; std::vector prompts; filtered_argv.push_back(argv[0]); params.n_gpu_layers = 20; - + // --------- ARG PARSING --------- for (int i = 1; i < argc; i++) { std::string arg = argv[i]; if (arg.compare(0, 2, "--") == 0) { @@ -168,7 +229,16 @@ int main(int argc, char **argv) { if (arg == "--parse-layer") { if (i + 1 < argc) { - parse_layer_value = argv[++i]; + std::string raw = argv[++i]; + // allow comma-separated list + size_t start = 0; + while (true) { + size_t pos = raw.find(',', start); + std::string item = raw.substr(start, pos - start); + if (!item.empty()) parse_layer_values.push_back(item); + if (pos == std::string::npos) break; + start = pos + 1; + } } else { fprintf(stderr, "error: --parse-layer requires an argument\n"); return 1; @@ -205,7 +275,7 @@ int main(int argc, char **argv) { } else if (arg == "--list-layers") { list_layers = true; if (i + 1 < argc && argv[i + 1][0] != '-') { - list_layers_filter = argv[++i]; // take optional argument + list_layers_filter = argv[++i]; // optional filter (unused below) } continue; } @@ -213,22 +283,46 @@ int main(int argc, char **argv) { filtered_argv.push_back(argv[i]); } + // open standard outputs prompt_output_file.open(output_prefix + "_prompt_output.txt"); tensor_output_file.open(output_prefix + "_tensor_output.txt"); - if (!prompt_output_file || !tensor_output_file) { std::cerr << "❌ Failed to open output files.\n"; return 1; } + // create tensors dir AFTER we know output_prefix + try { + std::filesystem::create_directories(output_prefix + "/tensors"); + } catch (const std::exception &e) { + std::cerr << "❌ Failed to create tensors directory: " << e.what() << "\n"; + return 1; + } + cb_data.base_dir = output_prefix + "/tensors"; + if (!common_params_parse((int)filtered_argv.size(), filtered_argv.data(), params, LLAMA_EXAMPLE_COMMON)) { return 1; } - if (!parse_layer_value.empty()) { - LOG_INF("Parse layer argument value: %s\n", parse_layer_value.c_str()); + // configure selector sets + if (list_layers) { + cb_data.list_mode = true; + } else { + if (parse_layer_values.empty()) { + // sensible default (keeps legacy behavior) + cb_data.exact_targets.insert("l_out-31"); + } else { + for (auto s : parse_layer_values) { + if (s == "__LIST__") { cb_data.list_mode = true; continue; } + if (!s.empty() && s.back() == '*') { + s.pop_back(); // treat trailing * as prefix + if (!s.empty()) cb_data.prefix_targets.push_back(s); + } else { + cb_data.exact_targets.insert(s); + } + } + } } - cb_data.parse_layer_name = parse_layer_value; common_init(); llama_backend_init(); @@ -238,7 +332,6 @@ int main(int argc, char **argv) { params.cb_eval_user_data = &cb_data; params.warmup = false; - common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); @@ -253,11 +346,10 @@ int main(int argc, char 
**argv) { LOG_INF("\n"); if (prompts.empty()) { - prompts.emplace_back("What is the capital of France?"); // Fallback default + prompts.emplace_back("What is the capital of France?"); // fallback } - if (list_layers) { - cb_data.parse_layer_name = "__LIST__"; + if (cb_data.list_mode) { params.n_predict = 1; params.prompt = "dummy"; // any valid prompt to trigger eval @@ -267,9 +359,11 @@ int main(int argc, char **argv) { } prompt_output_file.close(); tensor_output_file.close(); - + // close any opened per-tensor streams + for (auto &kv : cb_data.streams) { + if (kv.second && kv.second->is_open()) kv.second->close(); + } return 0; - } for (const auto& prompt : prompts) { @@ -287,6 +381,8 @@ int main(int argc, char **argv) { llama_backend_free(); prompt_output_file.close(); tensor_output_file.close(); - + for (auto &kv : cb_data.streams) { + if (kv.second && kv.second->is_open()) kv.second->close(); + } return 0; } From 11a750518ac7905c188dc88a56587031c2c0b7ac Mon Sep 17 00:00:00 2001 From: "casper.dert" Date: Sat, 6 Sep 2025 10:58:39 +0200 Subject: [PATCH 10/10] Feature: save data from multiple layers during inference --- examples/eval-callback/eval-callback.cpp | 205 ++++++++++++++++------- 1 file changed, 141 insertions(+), 64 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 07578feb0edc9..a402baeb357f1 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -10,6 +10,7 @@ #include #include #include +#include "ggml-backend.h" #include @@ -44,6 +45,12 @@ struct callback_data { std::unordered_map> streams; }; +struct sampling_cfg { + int top_k = -1; // <1 = disabled + float top_p = 1.0f; // >=1 = disabled + float temp = 1.0f; // we will always apply temperature (min clamp) +}; + static bool matches_target(const std::string &name, const callback_data *cb) { if (cb->exact_targets.find(name) != cb->exact_targets.end()) return true; for (const auto &pref : cb->prefix_targets) { @@ -150,17 +157,38 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { } -static bool run(llama_context * ctx, const common_params & params, callback_data & cb_data) { +static bool run(llama_context * ctx, + const common_params & params, + const sampling_cfg & samp, + callback_data & cb_data){ const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); const bool add_bos = llama_vocab_get_add_bos(vocab); std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); - auto sparams = llama_sampler_chain_default_params(); - sparams.no_perf = false; - llama_sampler * sampler = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(sampler, llama_sampler_init_greedy()); + auto chain_params = llama_sampler_chain_default_params(); + chain_params.no_perf = false; + llama_sampler * sampler = llama_sampler_chain_init(chain_params); + + // Always apply provided temperature (clamped to >0 above) + llama_sampler_chain_add(sampler, llama_sampler_init_temp(samp.temp)); + + // Optional: top-k + if (samp.top_k > 0) { + llama_sampler_chain_add(sampler, llama_sampler_init_top_k(samp.top_k)); + } + + // Optional: top-p + if (samp.top_p < 1.0f) { + // min_keep = 1 is sane + llama_sampler_chain_add(sampler, llama_sampler_init_top_p(samp.top_p, 1)); + } + + // Add RNG distribution so temp/top-k/top-p actually randomize + uint32_t seed = (uint32_t) ggml_time_us(); + llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed)); + llama_batch 
batch = llama_batch_get_one(tokens.data(), tokens.size()); cb_data.current_token_index = -1; @@ -210,6 +238,8 @@ int main(int argc, char **argv) { std::string output_prefix = "default"; callback_data cb_data; + sampling_cfg samp; // <-- add this + common_params params; bool list_layers = false; std::string list_layers_filter = ""; @@ -220,69 +250,116 @@ int main(int argc, char **argv) { filtered_argv.push_back(argv[0]); params.n_gpu_layers = 20; - // --------- ARG PARSING --------- - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - if (arg.compare(0, 2, "--") == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } +// --------- ARG PARSING --------- +for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg.compare(0, 2, "--") == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } - if (arg == "--parse-layer") { - if (i + 1 < argc) { - std::string raw = argv[++i]; - // allow comma-separated list - size_t start = 0; - while (true) { - size_t pos = raw.find(',', start); - std::string item = raw.substr(start, pos - start); - if (!item.empty()) parse_layer_values.push_back(item); - if (pos == std::string::npos) break; - start = pos + 1; - } - } else { - fprintf(stderr, "error: --parse-layer requires an argument\n"); - return 1; - } - continue; - - } else if (arg == "--prompt") { - if (i + 1 < argc) { - prompts.emplace_back(argv[++i]); - } else { - fprintf(stderr, "error: --prompt requires an argument\n"); - return 1; + // --parse-layer + if (arg == "--parse-layer") { + if (i + 1 < argc) { + std::string raw = argv[++i]; + size_t start = 0; + while (true) { + size_t pos = raw.find(',', start); + std::string item = raw.substr(start, pos - start); + if (!item.empty()) parse_layer_values.push_back(item); + if (pos == std::string::npos) break; + start = pos + 1; } - continue; - - } else if (arg == "--output-prefix") { - if (i + 1 < argc) { - output_prefix = argv[++i]; - } else { - fprintf(stderr, "error: --output-prefix requires a string argument\n"); - return 1; - } - continue; - - } else if (arg == "--n-gpu-layers") { - if (i + 1 < argc) { - params.n_gpu_layers = std::stoi(argv[++i]); // override default - } else { - fprintf(stderr, "error: --n-gpu-layers requires an integer argument\n"); - return 1; - } - continue; + } else { + fprintf(stderr, "error: --parse-layer requires an argument\n"); + return 1; + } + continue; + } - } else if (arg == "--list-layers") { - list_layers = true; - if (i + 1 < argc && argv[i + 1][0] != '-') { - list_layers_filter = argv[++i]; // optional filter (unused below) - } - continue; + // --prompt "..." 
+ if (arg == "--prompt") { + if (i + 1 < argc) { + prompts.emplace_back(argv[++i]); + } else { + fprintf(stderr, "error: --prompt requires an argument\n"); + return 1; + } + continue; + } + + // --top-k N + if (arg == "--top-k") { + if (i + 1 < argc) { + samp.top_k = std::stoi(argv[++i]); + if (samp.top_k < 1) samp.top_k = -1; // disable if <1 + } else { + fprintf(stderr, "error: --top-k requires an int\n"); + return 1; } + continue; + } + + // --top-p F + if (arg == "--top-p") { + if (i + 1 < argc) { + samp.top_p = std::stof(argv[++i]); + if (samp.top_p <= 0.0f) samp.top_p = 1.0f; + if (samp.top_p > 1.0f) samp.top_p = 1.0f; // clamp + } else { + fprintf(stderr, "error: --top-p requires a float\n"); + return 1; + } + continue; + } - filtered_argv.push_back(argv[i]); + // --temp F (or --temperature F) + if (arg == "--temp" || arg == "--temperature") { + if (i + 1 < argc) { + samp.temp = std::stof(argv[++i]); + if (samp.temp <= 0.0f) samp.temp = 1e-6f; // avoid greedy (force >0) + } else { + fprintf(stderr, "error: --temperature requires a float\n"); + return 1; + } + continue; } + // --output-prefix STR + if (arg == "--output-prefix") { + if (i + 1 < argc) { + output_prefix = argv[++i]; + } else { + fprintf(stderr, "error: --output-prefix requires a string argument\n"); + return 1; + } + continue; + } + + // --n-gpu-layers N + if (arg == "--n-gpu-layers") { + if (i + 1 < argc) { + params.n_gpu_layers = std::stoi(argv[++i]); + } else { + fprintf(stderr, "error: --n-gpu-layers requires an integer argument\n"); + return 1; + } + continue; + } + + // --list-layers [optional_filter] + if (arg == "--list-layers") { + list_layers = true; + if (i + 1 < argc && argv[i + 1][0] != '-') { + list_layers_filter = argv[++i]; // optional, currently unused + } + continue; + } + + // Unrecognized flag/arg: pass through to common_params_parse + filtered_argv.push_back(argv[i]); +} + + // open standard outputs prompt_output_file.open(output_prefix + "_prompt_output.txt"); tensor_output_file.open(output_prefix + "_tensor_output.txt"); @@ -353,7 +430,7 @@ int main(int argc, char **argv) { params.n_predict = 1; params.prompt = "dummy"; // any valid prompt to trigger eval - if (!run(ctx, params, cb_data)) { + if (!run(ctx, params, samp, cb_data)) { LOG_ERR("Failed during layer listing run\n"); return 1; } @@ -369,13 +446,13 @@ int main(int argc, char **argv) { for (const auto& prompt : prompts) { prompt_output_file << "Running prompt: " << prompt << "\n"; params.prompt = prompt; - if (!run(ctx, params, cb_data)) { + if (!run(ctx, params, samp, cb_data)) { LOG_ERR("Failed on prompt: %s\n", prompt.c_str()); return 1; } } - LOG("\n"); + LOG_INF("\n"); llama_perf_context_print(ctx); llama_backend_free();
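
Note on usage: the README example added in [PATCH 02/10] predates the later commits in this series. By [PATCH 10/10] the example accepts multiple `--parse-layer` targets (comma-separated; a trailing `*` is treated as a name prefix), plus `--list-layers`, `--output-prefix`, `--n-gpu-layers`, and the `--top-k` / `--top-p` / `--temp` sampling flags, and it writes per-tensor dumps under `<output-prefix>/tensors/`. A minimal invocation sketch under those assumptions — the model path and the exact layer names below are placeholders, not taken from the patches:

```shell
llama-eval-callback \
  --model path/to/model.gguf \
  --parse-layer "l_out-31,blk.0.*" \
  --output-prefix run1 \
  --n-gpu-layers 20 \
  --n-predict 64 \
  --top-k 40 --top-p 0.9 --temp 0.8 \
  --prompt "What is the capital of France?" \
  --prompt "Explain black holes"
```

With these flags the generated text for each prompt goes to `run1_prompt_output.txt`, and every tensor whose name matches a target is appended to `run1/tensors/<sanitized-name>.txt`, one block per decode step.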