#include "llama-context.h"

#include <atomic>
#include <chrono>
#include <cinttypes>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <type_traits>
#include <vector>

#include "../ggml/include/ggml-backend.h"

#include "llama-mmap.h"
#include "llama-model.h"

17+ // from
18+ // https://stackoverflow.com/questions/16337610/how-to-know-if-a-type-is-a-specialization-of-stdvector
19+ template <typename , template <typename ...> typename > constexpr bool is_specialization_v = false ;
20+
21+ template <template <typename ...> typename value_type, typename ... arg_types>
22+ constexpr bool is_specialization_v<value_type<arg_types...>, value_type> = true ;
23+
24+ template <typename value_type> concept time_type = is_specialization_v<value_type, std::chrono::duration>;
25+
26+ template <time_type value_type = std::chrono::nanoseconds> class stop_watch {
27+ public:
28+ using hr_clock = std::conditional_t <std::chrono::high_resolution_clock::is_steady,
29+ std::chrono::high_resolution_clock, std::chrono::steady_clock>;
30+ static constexpr bool lock_free{ std::atomic<value_type>::is_always_lock_free };
31+ using time_type = std::conditional_t <lock_free, value_type, uint64_t >;
32+
33+ stop_watch (uint64_t newTime) noexcept { total_time_units.store (time_type{ newTime }, std::memory_order_release); }
34+
35+ stop_watch & operator =(stop_watch && other) noexcept {
36+ if (this != &other) {
37+ total_time_units.store (other.total_time_units .load (std::memory_order_acquire), std::memory_order_release);
38+ start_time_units.store (other.start_time_units .load (std::memory_order_acquire), std::memory_order_release);
39+ }
40+ return *this ;
41+ }
42+
43+ stop_watch (stop_watch && other) noexcept { *this = std::move (other); }
44+
45+ stop_watch & operator =(const stop_watch & other) noexcept {
46+ if (this != &other) {
47+ total_time_units.store (other.total_time_units .load (std::memory_order_acquire), std::memory_order_release);
48+ start_time_units.store (other.start_time_units .load (std::memory_order_acquire), std::memory_order_release);
49+ }
50+ return *this ;
51+ }
52+
53+ stop_watch (const stop_watch & other) noexcept { *this = other; }
54+
55+ bool has_time_elapsed () noexcept {
56+ return ((get_current_time () - start_time_units.load (std::memory_order_acquire)) >=
57+ total_time_units.load (std::memory_order_acquire));
58+ }
59+
60+ void add_time () noexcept {
61+ std::unique_lock lock{ mutex };
62+ values.emplace_back (total_time_elapsed ());
63+ lock.release ();
64+ reset ();
65+ }
66+
67+ uint64_t get_average (time_type newTimeValue = time_type{}) noexcept {
68+ std::unique_lock lock{ mutex };
69+ uint64_t total_time{};
70+ for (auto & value : values) {
71+ total_time += get_value_as_uint (value);
72+ }
73+ return total_time / ((values.size () > 0 ) ? values.size () : 1 );
74+ }
75+
76+ void reset (time_type newTimeValue = time_type{}) noexcept {
77+ if (newTimeValue != time_type{}) {
78+ total_time_units.store (newTimeValue, std::memory_order_release);
79+ }
80+ start_time_units.store (get_current_time (), std::memory_order_release);
81+ }
82+
83+ uint64_t get_total_wait_time () const noexcept {
84+ return get_value_as_uint (total_time_units.load (std::memory_order_acquire));
85+ }
86+
87+ time_type total_time_elapsed () noexcept {
88+ return get_current_time () - start_time_units.load (std::memory_order_acquire);
89+ }
90+
91+ uint64_t total_time_elapsed_uint64 () noexcept {
92+ return get_value_as_uint (get_current_time ()) -
93+ get_value_as_uint (start_time_units.load (std::memory_order_acquire));
94+ }
95+
96+ protected:
97+ std::atomic<time_type> total_time_units{};
98+ std::atomic<time_type> start_time_units{};
99+ std::vector<time_type> values{};
100+ std::mutex mutex{};
101+
102+ time_type get_current_time () {
103+ if constexpr (lock_free) {
104+ return std::chrono::duration_cast<value_type>(hr_clock::now ().time_since_epoch ());
105+ } else {
106+ return std::chrono::duration_cast<value_type>(hr_clock::now ().time_since_epoch ()).count ();
107+ }
108+ }
109+
110+ uint64_t get_value_as_uint (time_type time) {
111+ if constexpr (lock_free) {
112+ return time.count ();
113+ } else {
114+ return time;
115+ }
116+ }
117+ };
118+
//
// llama_context
//
@@ -895,7 +999,7 @@ std::string format_dimensions(const int64_t * ne, int max_dims = GGML_MAX_DIMS)
895999}
8961000
8971001// Main pretty print function
898- void pretty_print_tensor (const struct ggml_tensor * tensor, std::ostream & os) {
1002+ void pretty_print_tensor (const struct ggml_tensor * tensor, std::ostream & os, bool input = false ) {
8991003 if (!tensor) {
9001004 os << " NULL tensor\n " ;
9011005 return ;
@@ -908,19 +1012,36 @@ void pretty_print_tensor(const struct ggml_tensor * tensor, std::ostream & os) {
9081012 } else {
9091013 tensor_name = " <unnamed>" ;
9101014 }
911-
1015+ std::string tab{};
1016+ if (input) {
1017+ tab = " " ;
1018+ }
9121019 // Format output with nice alignment
9131020 const int label_width = 12 ;
914-
915- os << " ────────────────────────────────────────\n " ;
916- os << " " << std::left << std::setw (37 ) << (" Tensor: " + tensor_name) << " │\n " ;
917- os << " ────────────────────────────────────────\n " ;
918- os << " " << std::left << std::setw (label_width) << " Type:" << std::left << std::setw (24 )
1021+ if (!input) {
1022+ os << " ────────────────────────────────────────\n " ;
1023+ os << tab << std::left << std::setw (37 ) << (" Tensor: " + tensor_name) << " │\n " ;
1024+ os << " ────────────────────────────────────────\n " ;
1025+ } else {
1026+ os << tab << std::left << std::setw (37 ) << (" Tensor: " + tensor_name) << " │\n " ;
1027+ }
1028+ os << tab << std::left << std::setw (label_width) << " Type:" << std::left << std::setw (24 )
9191029 << ggml_type_name (tensor->type ) << " \n " ;
920- os << " " << std::left << std::setw (label_width) << " Dimensions:" << std::left << std::setw (24 )
1030+ os << tab << std::left << std::setw (label_width) << " Dimensions:" << std::left << std::setw (24 )
9211031 << format_dimensions (tensor->ne ) << " \n " ;
922- os << " " << std::left << std::setw (label_width) << " Operation:" << std::left << std::setw (24 )
923- << ggml_op_name (tensor->op ) << " \n " ;
1032+ if (!input) {
1033+ os << " " << std::left << std::setw (label_width) << " Operation:" << std::left << std::setw (24 )
1034+ << ggml_op_name (tensor->op ) << " \n " ;
1035+ os << " " << std::left << std::setw (label_width) << " Inputs:" ;
1036+ size_t input{};
1037+ for (ggml_tensor * const * tensor_new = tensor->src ; *tensor_new; ++tensor_new) {
1038+ ++input;
1039+ }
1040+ os << " " << std::left << std::setw (label_width) << std::to_string (input) << " \n " ;
1041+ for (ggml_tensor * const * tensor_new = tensor->src ; *tensor_new; ++tensor_new) {
1042+ pretty_print_tensor (*tensor_new, os, true );
1043+ }
1044+ }
9241045
9251046 // Calculate total elements
9261047 int64_t total_elements = 1 ;
@@ -929,12 +1050,16 @@ void pretty_print_tensor(const struct ggml_tensor * tensor, std::ostream & os) {
9291050 total_elements *= tensor->ne [i];
9301051 }
9311052 }
932- os << " " << std::left << std::setw (label_width) << " Elements:" << std::left << std::setw (24 ) << total_elements
933- << " \n " ;
1053+ if (!input) {
1054+ os << " " << std::left << std::setw (label_width) << " Elements:" << std::left << std::setw (24 ) << total_elements
1055+ << " \n " ;
9341056
935- os << " ─────────────────────────────────────────\n " ;
1057+ os << " ─────────────────────────────────────────\n " ;
1058+ }
9361059}
9371060
1061+ stop_watch stop_watch_val{ 0 };
1062+
9381063bool save_string_to_file (const std::string & content, const std::string & filename) {
9391064 std::ofstream file (filename);
9401065 if (!file.is_open ()) {
@@ -1074,17 +1199,18 @@ int llama_context::decode(llama_batch & inp_batch) {
10741199 // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
10751200
10761201 ggml_backend_sched_alloc_graph (sched.get (), gf);
1077- std::stringstream stream{};
1078- for (size_t x = 0 ; x < gf->n_leafs ; ++x) {
1079- pretty_print_tensor (gf->leafs [x], stream);
1080- }
1081- for (size_t x = 0 ; x < gf->n_nodes ; ++x) {
1082- pretty_print_tensor (gf->nodes [x], stream);
1083- }
1084- save_string_to_file (stream.str (), " ../../../../../TensorData.txt" );
1202+ // std::stringstream stream{};
1203+ // for (size_t x = 0; x < gf->n_leafs; ++x) {
1204+ // pretty_print_tensor(gf->leafs[x], stream);
1205+ // }
1206+ // for (size_t x = 0; x < gf->n_nodes; ++x) {
1207+ // pretty_print_tensor(gf->nodes[x], stream);
1208+ // }
1209+ // save_string_to_file(stream.str(), "../../../../../TensorData.txt");
10851210 res->set_inputs (&ubatch);
1086-
1211+ stop_watch_val. reset ();
10871212 const auto compute_status = graph_compute (gf, ubatch.n_tokens > 1 );
1213+ std::cout << " GRAPH COMPUTE TIME: " << stop_watch_val.total_time_elapsed () << std::endl;
10881214 if (compute_status != GGML_STATUS_SUCCESS) {
10891215 switch (compute_status) {
10901216 case GGML_STATUS_ABORTED:
// (GitHub page residue removed: "0 commit comments" is not source code)