#include "llama-context.h"

+ #include <atomic>      // std::atomic, used by test::stop_watch below
+ #include <chrono>
#include <cinttypes>
#include <cstring>
+ #include <iostream>    // std::cout, used for the timing printout in llama_context::decode
+ #include <mutex>
#include <stdexcept>
+ #include <type_traits> // std::conditional_t
+ #include <vector>      // lap-time storage in test::stop_watch

#include "../ggml/include/ggml-backend.h"
#include "llama-mmap.h"
#include "llama-model.h"

+ namespace test {
+
+ // from
+ // https://stackoverflow.com/questions/16337610/how-to-know-if-a-type-is-a-specialization-of-stdvector
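+ // primary template: false for any type that is not a specialization of the given template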
+ template <typename, template <typename...> typename> constexpr bool is_specialization_v = false;
+
+ template <template <typename...> typename value_type, typename... arg_types>
+ constexpr bool is_specialization_v<value_type<arg_types...>, value_type> = true;
+
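+ // constrains the stop watch's tick type to specializations of std::chrono::duration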
+ template <typename value_type> concept time_type = is_specialization_v<value_type, std::chrono::duration>;
+
+ template <time_type value_type = std::chrono::nanoseconds> class stop_watch {
+   public:
+     using hr_clock = std::conditional_t<std::chrono::high_resolution_clock::is_steady,
+         std::chrono::high_resolution_clock, std::chrono::steady_clock>;
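+     // if std::atomic cannot hold the duration lock-free, fall back to storing raw uint64_t tick counts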
+     static constexpr bool lock_free{ std::atomic<value_type>::is_always_lock_free };
+     using time_type = std::conditional_t<lock_free, value_type, uint64_t>;
+
+     stop_watch(uint64_t newTime) noexcept { total_time_units.store(time_type{ newTime }, std::memory_order_release); }
+
+     stop_watch & operator=(stop_watch && other) noexcept {
+         if (this != &other) {
+             total_time_units.store(other.total_time_units.load(std::memory_order_acquire), std::memory_order_release);
+             start_time_units.store(other.start_time_units.load(std::memory_order_acquire), std::memory_order_release);
+         }
+         return *this;
+     }
+
+     stop_watch(stop_watch && other) noexcept { *this = std::move(other); }
+
+     stop_watch & operator=(const stop_watch & other) noexcept {
+         if (this != &other) {
+             total_time_units.store(other.total_time_units.load(std::memory_order_acquire), std::memory_order_release);
+             start_time_units.store(other.start_time_units.load(std::memory_order_acquire), std::memory_order_release);
+         }
+         return *this;
+     }
+
+     stop_watch(const stop_watch & other) noexcept { *this = other; }
+
+     bool has_time_elapsed() noexcept {
+         return ((get_current_time() - start_time_units.load(std::memory_order_acquire)) >=
+                 total_time_units.load(std::memory_order_acquire));
+     }
+
+     void add_time() noexcept {
+         {
+             // lock against concurrent readers of `values` (get_count / get_average)
+             std::unique_lock lock{ mutex };
+             values.emplace_back(total_time_elapsed());
+         }
+         reset();
+     }
+
+     uint64_t get_count() noexcept {
+         std::unique_lock lock{ mutex };
+         return values.size();
+     }
+
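+     // mean of all recorded laps, expressed in value_type ticks; returns 0 when no laps were recorded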
+     uint64_t get_average() noexcept {
+         std::unique_lock lock{ mutex };
+         uint64_t total_time{};
+         for (auto & value : values) {
+             total_time += get_value_as_uint(value);
+         }
+         return total_time / ((values.size() > 0) ? values.size() : 1);
+     }
+
+     void reset(time_type newTimeValue = time_type{}) noexcept {
+         if (newTimeValue != time_type{}) {
+             total_time_units.store(newTimeValue, std::memory_order_release);
+         }
+         start_time_units.store(get_current_time(), std::memory_order_release);
+     }
+
+     uint64_t get_total_wait_time() const noexcept {
+         return get_value_as_uint(total_time_units.load(std::memory_order_acquire));
+     }
+
+     time_type total_time_elapsed() noexcept {
+         return get_current_time() - start_time_units.load(std::memory_order_acquire);
+     }
+
+     uint64_t total_time_elapsed_uint64() noexcept {
+         return get_value_as_uint(get_current_time()) -
+             get_value_as_uint(start_time_units.load(std::memory_order_acquire));
+     }
+
+   protected:
+     std::atomic<time_type> total_time_units{};
+     std::atomic<time_type> start_time_units{};
+     std::vector<time_type> values{};
+     std::mutex mutex{};
+
+     // static: these helpers touch no instance state, which also lets const members call them
+     static time_type get_current_time() noexcept {
+         if constexpr (lock_free) {
+             return std::chrono::duration_cast<value_type>(hr_clock::now().time_since_epoch());
+         } else {
+             return std::chrono::duration_cast<value_type>(hr_clock::now().time_since_epoch()).count();
+         }
+     }
+
+     static uint64_t get_value_as_uint(time_type time) noexcept {
+         if constexpr (lock_free) {
+             return time.count();
+         } else {
+             return time;
+         }
+     }
+ };
+ } // namespace test
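+ // Illustrative usage sketch (comment only, not part of the patch logic). `do_work()`
+ // is a hypothetical stand-in for the region being timed:
+ //
+ //     test::stop_watch<std::chrono::microseconds> sw{ 0 };
+ //     sw.reset();                           // start a lap
+ //     do_work();                            // code under measurement
+ //     sw.add_time();                        // record the lap and restart
+ //     uint64_t mean_us = sw.get_average();  // mean lap time, in microseconds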
+
//
// llama_context
//
@@ -895,7 +1004,7 @@ std::string format_dimensions(const int64_t * ne, int max_dims = GGML_MAX_DIMS)
}

// Main pretty print function
- void pretty_print_tensor(const struct ggml_tensor * tensor, std::ostream & os) {
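+ // `input == true` marks a recursive call for a source tensor, printed indented and without the full header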
+ void pretty_print_tensor(const struct ggml_tensor * tensor, std::ostream & os, bool input = false) {
    if (!tensor) {
        os << "NULL tensor\n";
        return;
@@ -908,19 +1017,36 @@ void pretty_print_tensor(const struct ggml_tensor * tensor, std::ostream & os) {
    } else {
        tensor_name = "<unnamed>";
    }
-
+     std::string tab{};
+     if (input) {
+         tab = "    ";
+     }
    // Format output with nice alignment
    const int label_width = 12;
-
-     os << "────────────────────────────────────────\n";
-     os << " " << std::left << std::setw(37) << ("Tensor: " + tensor_name) << " │\n";
-     os << "────────────────────────────────────────\n";
-     os << " " << std::left << std::setw(label_width) << "Type:" << std::left << std::setw(24)
+     if (!input) {
+         os << "────────────────────────────────────────\n";
+         os << tab << std::left << std::setw(37) << ("Tensor: " + tensor_name) << " │\n";
+         os << "────────────────────────────────────────\n";
+     } else {
+         os << tab << std::left << std::setw(37) << ("Tensor: " + tensor_name) << " │\n";
+     }
+     os << tab << std::left << std::setw(label_width) << "Type:" << std::left << std::setw(24)
        << ggml_type_name(tensor->type) << "\n";
-     os << " " << std::left << std::setw(label_width) << "Dimensions:" << std::left << std::setw(24)
+     os << tab << std::left << std::setw(label_width) << "Dimensions:" << std::left << std::setw(24)
        << format_dimensions(tensor->ne) << "\n";
-     os << " " << std::left << std::setw(label_width) << "Operation:" << std::left << std::setw(24)
-         << ggml_op_name(tensor->op) << "\n";
+     if (!input) {
+         os << " " << std::left << std::setw(label_width) << "Operation:" << std::left << std::setw(24)
+             << ggml_op_name(tensor->op) << "\n";
+         os << " " << std::left << std::setw(label_width) << "Inputs:";
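+         // count populated source slots; ggml stores at most GGML_MAX_SRC source pointers per tensor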
+         size_t n_src{};
+         for (size_t i = 0; i < GGML_MAX_SRC && tensor->src[i] != nullptr; ++i) {
+             ++n_src;
+         }
+         os << " " << std::left << std::setw(label_width) << std::to_string(n_src) << "\n";
+         // recurse into each source tensor, rendered in the abbreviated "input" style
+         for (size_t i = 0; i < GGML_MAX_SRC && tensor->src[i] != nullptr; ++i) {
+             pretty_print_tensor(tensor->src[i], os, true);
+         }
+     }

    // Calculate total elements
    int64_t total_elements = 1;
@@ -929,12 +1055,16 @@ void pretty_print_tensor(const struct ggml_tensor * tensor, std::ostream & os) {
            total_elements *= tensor->ne[i];
        }
    }
-     os << " " << std::left << std::setw(label_width) << "Elements:" << std::left << std::setw(24) << total_elements
-         << "\n";
+     if (!input) {
+         os << " " << std::left << std::setw(label_width) << "Elements:" << std::left << std::setw(24) << total_elements
+             << "\n";

-     os << "─────────────────────────────────────────\n";
+         os << "─────────────────────────────────────────\n";
+     }
}

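+ // global stop watch accumulating per-decode graph compute times (nanosecond resolution by default)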
+ test::stop_watch stop_watch_val{ 0 };
+
bool save_string_to_file(const std::string & content, const std::string & filename) {
    std::ofstream file(filename);
    if (!file.is_open()) {
@@ -1074,17 +1204,21 @@ int llama_context::decode(llama_batch & inp_batch) {
    // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

    ggml_backend_sched_alloc_graph(sched.get(), gf);
-     std::stringstream stream{};
-     for (size_t x = 0; x < gf->n_leafs; ++x) {
-         pretty_print_tensor(gf->leafs[x], stream);
-     }
-     for (size_t x = 0; x < gf->n_nodes; ++x) {
-         pretty_print_tensor(gf->nodes[x], stream);
-     }
-     save_string_to_file(stream.str(), "../../../../../TensorData.txt");
+     // std::stringstream stream{};
+     // for (size_t x = 0; x < gf->n_leafs; ++x) {
+     //     pretty_print_tensor(gf->leafs[x], stream);
+     // }
+     // for (size_t x = 0; x < gf->n_nodes; ++x) {
+     //     pretty_print_tensor(gf->nodes[x], stream);
+     // }
+     // save_string_to_file(stream.str(), "../../../../../TensorData.txt");
    res->set_inputs(&ubatch);
-
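+     // time only the backend graph execution; one lap per decode call feeds the running average below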
+     stop_watch_val.reset();
    const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1);
+     stop_watch_val.add_time();
+     std::cout << "LLAMA.CPP/GGML AVERAGE COMPUTE TIME, OVER: "
+         << std::setw(50 - (std::size("LLAMA.CPP/GGML AVERAGE COMPUTE TIME, OVER: ") - 1)) // std::size counts the '\0'
+         << stop_watch_val.get_count() << " TOKENS: " << stop_watch_val.get_average() << std::endl;
    if (compute_status != GGML_STATUS_SUCCESS) {
        switch (compute_status) {
            case GGML_STATUS_ABORTED: