examples/llama-bench/llama-bench.cpp (14 additions, 7 deletions)
@@ -21,6 +21,8 @@
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "llama-context.h"
+#include <filesystem>
 
 #ifdef _WIN32
 #  define WIN32_LEAN_AND_MEAN
@@ -879,6 +881,7 @@ struct test {
     const std::string cpu_info;
     const std::string gpu_info;
     std::string model_filename;
+    int n_graph_splits;
     std::string model_type;
     uint64_t model_size;
     uint64_t model_n_params;
@@ -936,7 +939,7 @@ struct test {
         std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
         test_time = buf;
 
-        (void) ctx;
+        n_graph_splits = ctx->get_graph_splits();
     }
 
     uint64_t avg_ns() const { return ::avg(samples_ns); }
@@ -970,11 +973,11 @@ struct test {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
-            "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
-            "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
-            "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
-            "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns",
-            "avg_ts", "stddev_ts",
+            "n_graph_splits", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch",
+            "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v",
+            "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split",
+            "use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns",
+            "stddev_ns", "avg_ts", "stddev_ts",
         };
         return fields;
     }
@@ -985,7 +988,7 @@ struct test {
         if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
             field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
             field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
-            field == "stddev_ns") {
+            field == "stddev_ns" || field == "n_graph_splits") {
             return INT;
         }
         if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
@@ -1020,6 +1023,7 @@ struct test {
             gpu_info,
             get_backend(),
             model_filename,
+            std::to_string(n_graph_splits),
             model_type,
             std::to_string(model_size),
             std::to_string(model_n_params),
@@ -1196,6 +1200,9 @@ struct markdown_printer : public printer {
         if (field == "n_gpu_layers") {
             return 3;
         }
+        if (field == "n_graph_splits") {
+            return 3;
+        }
         if (field == "n_threads") {
             return 7;
         }
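Note that `get_fields()` and `get_values()` are parallel arrays, so the new `n_graph_splits` entry has to appear at the same index in both (right after `model_filename` in the two hunks above). A minimal sketch of how a consumer could rely on that invariant to read one column out of a result row; `get_int_field` is a hypothetical helper, not part of this PR:

#include <algorithm>
#include <string>
#include <vector>

// Hypothetical helper: look a column up by name and parse it as an integer.
// This only works because get_fields() and get_values() stay index-aligned.
static int get_int_field(const std::vector<std::string> & fields,
                         const std::vector<std::string> & values,
                         const std::string & name) {
    auto it = std::find(fields.begin(), fields.end(), name);
    if (it == fields.end() || values.size() != fields.size()) {
        return -1; // column missing or row malformed
    }
    return std::stoi(values[it - fields.begin()]);
}

// usage (hypothetical, with a `test` instance t from llama-bench):
//   int n = get_int_field(test::get_fields(), t.get_values(), "n_graph_splits");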
src/llama-context.cpp (7 additions, 0 deletions)
@@ -2216,6 +2216,13 @@ void llama_context::perf_reset() {
     t_p_eval_us = n_p_eval = 0;
 }
 
+/**
+ * @brief Get the number of graph splits.
+ */
+int llama_context::get_graph_splits() const {
+    return ggml_backend_sched_get_n_splits(sched.get());
+}
+
 //
 // interface implementation
 //
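Here `sched` is the context's `ggml_backend_sched_t`, and `ggml_backend_sched_get_n_splits()` is the existing ggml scheduler API being wrapped: when a graph is scheduled, contiguous runs of nodes assigned to the same backend form one split, and each additional split implies cross-backend tensor copies per evaluation. A standalone sketch of the same query outside `llama_context`, under the assumption that the caller already owns a scheduler and a built compute graph:

#include "ggml.h"
#include "ggml-backend.h"

// Sketch: compute a graph through the scheduler, then ask how many backend
// splits it was partitioned into. A count of 1 means the whole graph ran on
// a single backend; higher counts indicate cross-backend boundaries.
int compute_and_count_splits(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    if (ggml_backend_sched_graph_compute(sched, graph) != GGML_STATUS_SUCCESS) {
        return -1; // computation failed
    }
    return ggml_backend_sched_get_n_splits(sched);
}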
src/llama-context.h (2 additions, 0 deletions)
@@ -128,6 +128,8 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    int get_graph_splits() const;
+
 private:
     //
     // output
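Since the declaration sits above `private:`, the accessor is callable from any code that includes the internal header, which is exactly what llama-bench now does. A hedged usage sketch; `ctx` is assumed to be a live context on which at least one eval has already run, since the split count reflects the most recently scheduled graph:

#include <cstdio>
#include "llama-context.h"

// Sketch: report the split count after decoding. More than one split means
// the compute graph crossed a backend boundary (e.g. partial GPU offload),
// which adds synchronization and copy overhead on every evaluation.
void report_graph_splits(llama_context * ctx) {
    const int n_splits = ctx->get_graph_splits();
    fprintf(stderr, "compute graph was scheduled as %d split(s)\n", n_splits);
}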