diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index cbcbfcee861ee..b86093630c392 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -21,6 +21,8 @@
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "llama-context.h"
+#include
 
 #ifdef _WIN32
 #    define WIN32_LEAN_AND_MEAN
@@ -879,6 +881,7 @@ struct test {
     const std::string cpu_info;
     const std::string gpu_info;
     std::string model_filename;
+    int n_graph_splits;
     std::string model_type;
     uint64_t model_size;
     uint64_t model_n_params;
@@ -936,7 +939,7 @@ struct test {
         std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
         test_time = buf;
 
-        (void) ctx;
+        n_graph_splits = ctx->get_graph_splits();
     }
 
     uint64_t avg_ns() const { return ::avg(samples_ns); }
@@ -970,11 +973,11 @@ struct test {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit",   "build_number", "cpu_info",       "gpu_info",       "backends",     "model_filename",
-            "model_type",     "model_size",   "model_n_params", "n_batch",        "n_ubatch",     "n_threads",
-            "cpu_mask",       "cpu_strict",   "poll",           "type_k",         "type_v",       "n_gpu_layers",
-            "split_mode",     "main_gpu",     "no_kv_offload",  "flash_attn",     "tensor_split", "use_mmap",
-            "embeddings",     "n_prompt",     "n_gen",          "test_time",      "avg_ns",       "stddev_ns",
-            "avg_ts",         "stddev_ts",
+            "n_graph_splits", "model_type",   "model_size",     "model_n_params", "n_batch",      "n_ubatch",
+            "n_threads",      "cpu_mask",     "cpu_strict",     "poll",           "type_k",       "type_v",
+            "n_gpu_layers",   "split_mode",   "main_gpu",       "no_kv_offload",  "flash_attn",   "tensor_split",
+            "use_mmap",       "embeddings",   "n_prompt",       "n_gen",          "test_time",    "avg_ns",
+            "stddev_ns",      "avg_ts",       "stddev_ts",
         };
         return fields;
     }
@@ -985,7 +988,7 @@ struct test {
         if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
             field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
             field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
-            field == "stddev_ns") {
+            field == "stddev_ns" || field == "n_graph_splits") {
             return INT;
         }
         if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
@@ -1020,6 +1023,7 @@ struct test {
             gpu_info,
             get_backend(),
             model_filename,
+            std::to_string(n_graph_splits),
             model_type,
             std::to_string(model_size),
             std::to_string(model_n_params),
@@ -1196,6 +1200,9 @@ struct markdown_printer : public printer {
         if (field == "n_gpu_layers") {
             return 3;
         }
+        if (field == "n_graph_splits") {
+            return 3;
+        }
         if (field == "n_threads") {
             return 7;
         }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 983385f86d494..cb4802ca9b195 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2216,6 +2216,13 @@ void llama_context::perf_reset() {
     t_p_eval_us = n_p_eval = 0;
 }
 
+/**
+ * @brief Get the number of graph splits.
+ */
+int llama_context::get_graph_splits() const {
+    return ggml_backend_sched_get_n_splits(sched.get());
+}
+
 //
 // interface implementation
 //
diff --git a/src/llama-context.h b/src/llama-context.h
index 04facb544cb1a..39783132d72c3 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -128,6 +128,8 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    int get_graph_splits() const;
+
 private:
     //
     // output
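
Note on the new accessor: ggml_backend_sched_get_n_splits() is an existing ggml scheduler function; the patch exposes it through llama_context and records it once per benchmark run, so the new n_graph_splits column in the llama-bench output reports how many times the compute graph was split across backends (fewer splits generally means fewer host/device synchronization points). Below is a minimal sketch of how a caller with access to the internal header could read the metric; the helper name report_graph_splits is illustrative and not part of the patch:

    // Illustrative sketch only: mirrors how test::test() above consumes the accessor.
    // Building this requires llama.cpp's internal headers (src/), not just llama.h.
    #include <cstdio>
    #include "llama-context.h"

    static void report_graph_splits(llama_context * ctx) {
        // Meaningful only after at least one decode has run, since the backend
        // scheduler determines the split count when it allocates the graph.
        printf("graph splits: %d\n", ctx->get_graph_splits());
    }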