Skip to content

Commit 7d784e6

Browse files
committed
Updates!
1 parent 5232d6e commit 7d784e6

File tree

3 files changed

+162
-23
lines changed

3 files changed

+162
-23
lines changed

CMakeLists.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,16 @@ include(CheckIncludeFileCXX)
55
#set(CMAKE_WARN_DEPRECATED YES)
66
set(CMAKE_WARN_UNUSED_CLI YES)
77

8+
include(FetchContent)
9+
FetchContent_Declare(
10+
Jsonifier
11+
GIT_REPOSITORY https://github.com/realtimechris/jsonifier.git
12+
GIT_TAG dev
13+
)
14+
15+
# Also provides "common"
16+
FetchContent_MakeAvailable(Jsonifier)
17+
818
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
919

1020
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
@@ -211,6 +221,8 @@ set(LLAMA_PUBLIC_HEADERS
211221
${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
212222
${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
213223

224+
target_link_libraries(llama PUBLIC Jsonifier::Jsonifier)
225+
214226
set_target_properties(llama
215227
PROPERTIES
216228
PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")

CMakePresets.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
"binaryDir": "${sourceDir}/build-${presetName}",
99
"cacheVariables": {
1010
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
11-
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
11+
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/..",
12+
"LLAMA_CURL": "OFF"
1213
}
1314
},
1415
{

src/llama-context.cpp

Lines changed: 148 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#include "llama-context.h"
22

3+
#include <chrono>
34
#include <cinttypes>
45
#include <cstring>
6+
#include <mutex>
57
#include <stdexcept>
68

79
#include "../ggml/include/ggml-backend.h"
@@ -12,6 +14,108 @@
1214
#include "llama-mmap.h"
1315
#include "llama-model.h"
1416

// Detects whether a type is a specialization of a given class template
// (e.g. std::vector<int> matches std::vector). Adapted from:
// https://stackoverflow.com/questions/16337610/how-to-know-if-a-type-is-a-specialization-of-stdvector
template <typename, template <typename...> typename>
constexpr bool is_specialization_v = false;

// Partial specialization: true exactly when the first argument is the second
// template instantiated with some pack of type arguments.
template <template <typename...> typename template_type, typename... argument_types>
constexpr bool is_specialization_v<template_type<argument_types...>, template_type> = true;
23+
24+
template <typename value_type> concept time_type = is_specialization_v<value_type, std::chrono::duration>;
25+
26+
template <time_type value_type = std::chrono::nanoseconds> class stop_watch {
27+
public:
28+
using hr_clock = std::conditional_t<std::chrono::high_resolution_clock::is_steady,
29+
std::chrono::high_resolution_clock, std::chrono::steady_clock>;
30+
static constexpr bool lock_free{ std::atomic<value_type>::is_always_lock_free };
31+
using time_type = std::conditional_t<lock_free, value_type, uint64_t>;
32+
33+
stop_watch(uint64_t newTime) noexcept { total_time_units.store(time_type{ newTime }, std::memory_order_release); }
34+
35+
stop_watch & operator=(stop_watch && other) noexcept {
36+
if (this != &other) {
37+
total_time_units.store(other.total_time_units.load(std::memory_order_acquire), std::memory_order_release);
38+
start_time_units.store(other.start_time_units.load(std::memory_order_acquire), std::memory_order_release);
39+
}
40+
return *this;
41+
}
42+
43+
stop_watch(stop_watch && other) noexcept { *this = std::move(other); }
44+
45+
stop_watch & operator=(const stop_watch & other) noexcept {
46+
if (this != &other) {
47+
total_time_units.store(other.total_time_units.load(std::memory_order_acquire), std::memory_order_release);
48+
start_time_units.store(other.start_time_units.load(std::memory_order_acquire), std::memory_order_release);
49+
}
50+
return *this;
51+
}
52+
53+
stop_watch(const stop_watch & other) noexcept { *this = other; }
54+
55+
bool has_time_elapsed() noexcept {
56+
return ((get_current_time() - start_time_units.load(std::memory_order_acquire)) >=
57+
total_time_units.load(std::memory_order_acquire));
58+
}
59+
60+
void add_time() noexcept {
61+
std::unique_lock lock{ mutex };
62+
values.emplace_back(total_time_elapsed());
63+
lock.release();
64+
reset();
65+
}
66+
67+
uint64_t get_average(time_type newTimeValue = time_type{}) noexcept {
68+
std::unique_lock lock{ mutex };
69+
uint64_t total_time{};
70+
for (auto & value : values) {
71+
total_time += get_value_as_uint(value);
72+
}
73+
return total_time / ((values.size() > 0) ? values.size() : 1);
74+
}
75+
76+
void reset(time_type newTimeValue = time_type{}) noexcept {
77+
if (newTimeValue != time_type{}) {
78+
total_time_units.store(newTimeValue, std::memory_order_release);
79+
}
80+
start_time_units.store(get_current_time(), std::memory_order_release);
81+
}
82+
83+
uint64_t get_total_wait_time() const noexcept {
84+
return get_value_as_uint(total_time_units.load(std::memory_order_acquire));
85+
}
86+
87+
time_type total_time_elapsed() noexcept {
88+
return get_current_time() - start_time_units.load(std::memory_order_acquire);
89+
}
90+
91+
uint64_t total_time_elapsed_uint64() noexcept {
92+
return get_value_as_uint(get_current_time()) -
93+
get_value_as_uint(start_time_units.load(std::memory_order_acquire));
94+
}
95+
96+
protected:
97+
std::atomic<time_type> total_time_units{};
98+
std::atomic<time_type> start_time_units{};
99+
std::vector<time_type> values{};
100+
std::mutex mutex{};
101+
102+
time_type get_current_time() {
103+
if constexpr (lock_free) {
104+
return std::chrono::duration_cast<value_type>(hr_clock::now().time_since_epoch());
105+
} else {
106+
return std::chrono::duration_cast<value_type>(hr_clock::now().time_since_epoch()).count();
107+
}
108+
}
109+
110+
uint64_t get_value_as_uint(time_type time) {
111+
if constexpr (lock_free) {
112+
return time.count();
113+
} else {
114+
return time;
115+
}
116+
}
117+
};
118+
15119
//
16120
// llama_context
17121
//
@@ -895,7 +999,7 @@ std::string format_dimensions(const int64_t * ne, int max_dims = GGML_MAX_DIMS)
895999
}
8961000

8971001
// Main pretty print function
898-
void pretty_print_tensor(const struct ggml_tensor * tensor, std::ostream & os) {
1002+
void pretty_print_tensor(const struct ggml_tensor * tensor, std::ostream & os, bool input = false) {
8991003
if (!tensor) {
9001004
os << "NULL tensor\n";
9011005
return;
@@ -908,19 +1012,36 @@ void pretty_print_tensor(const struct ggml_tensor * tensor, std::ostream & os) {
9081012
} else {
9091013
tensor_name = "<unnamed>";
9101014
}
911-
1015+
std::string tab{};
1016+
if (input) {
1017+
tab = " ";
1018+
}
9121019
// Format output with nice alignment
9131020
const int label_width = 12;
914-
915-
os << "────────────────────────────────────────\n";
916-
os << "" << std::left << std::setw(37) << ("Tensor: " + tensor_name) << "\n";
917-
os << "────────────────────────────────────────\n";
918-
os << "" << std::left << std::setw(label_width) << "Type:" << std::left << std::setw(24)
1021+
if (!input) {
1022+
os << "────────────────────────────────────────\n";
1023+
os << tab << std::left << std::setw(37) << ("Tensor: " + tensor_name) << "\n";
1024+
os << "────────────────────────────────────────\n";
1025+
} else {
1026+
os << tab << std::left << std::setw(37) << ("Tensor: " + tensor_name) << "\n";
1027+
}
1028+
os << tab << std::left << std::setw(label_width) << "Type:" << std::left << std::setw(24)
9191029
<< ggml_type_name(tensor->type) << " \n";
920-
os << "" << std::left << std::setw(label_width) << "Dimensions:" << std::left << std::setw(24)
1030+
os << tab << std::left << std::setw(label_width) << "Dimensions:" << std::left << std::setw(24)
9211031
<< format_dimensions(tensor->ne) << " \n";
922-
os << "" << std::left << std::setw(label_width) << "Operation:" << std::left << std::setw(24)
923-
<< ggml_op_name(tensor->op) << " \n";
1032+
if (!input) {
1033+
os << "" << std::left << std::setw(label_width) << "Operation:" << std::left << std::setw(24)
1034+
<< ggml_op_name(tensor->op) << " \n";
1035+
os << "" << std::left << std::setw(label_width) << "Inputs:";
1036+
size_t input{};
1037+
for (ggml_tensor * const * tensor_new = tensor->src; *tensor_new; ++tensor_new) {
1038+
++input;
1039+
}
1040+
os << "" << std::left << std::setw(label_width) << std::to_string(input) << " \n";
1041+
for (ggml_tensor * const * tensor_new = tensor->src; *tensor_new; ++tensor_new) {
1042+
pretty_print_tensor(*tensor_new, os, true);
1043+
}
1044+
}
9241045

9251046
// Calculate total elements
9261047
int64_t total_elements = 1;
@@ -929,12 +1050,16 @@ void pretty_print_tensor(const struct ggml_tensor * tensor, std::ostream & os) {
9291050
total_elements *= tensor->ne[i];
9301051
}
9311052
}
932-
os << "" << std::left << std::setw(label_width) << "Elements:" << std::left << std::setw(24) << total_elements
933-
<< "\n";
1053+
if (!input) {
1054+
os << "" << std::left << std::setw(label_width) << "Elements:" << std::left << std::setw(24) << total_elements
1055+
<< "\n";
9341056

935-
os << "─────────────────────────────────────────\n";
1057+
os << "─────────────────────────────────────────\n";
1058+
}
9361059
}
9371060

1061+
stop_watch stop_watch_val{ 0 };
1062+
9381063
bool save_string_to_file(const std::string & content, const std::string & filename) {
9391064
std::ofstream file(filename);
9401065
if (!file.is_open()) {
@@ -1074,17 +1199,18 @@ int llama_context::decode(llama_batch & inp_batch) {
10741199
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
10751200

10761201
ggml_backend_sched_alloc_graph(sched.get(), gf);
1077-
std::stringstream stream{};
1078-
for (size_t x = 0; x < gf->n_leafs; ++x) {
1079-
pretty_print_tensor(gf->leafs[x], stream);
1080-
}
1081-
for (size_t x = 0; x < gf->n_nodes; ++x) {
1082-
pretty_print_tensor(gf->nodes[x], stream);
1083-
}
1084-
save_string_to_file(stream.str(), "../../../../../TensorData.txt");
1202+
//std::stringstream stream{};
1203+
//for (size_t x = 0; x < gf->n_leafs; ++x) {
1204+
//pretty_print_tensor(gf->leafs[x], stream);
1205+
//}
1206+
//for (size_t x = 0; x < gf->n_nodes; ++x) {
1207+
//pretty_print_tensor(gf->nodes[x], stream);
1208+
//}
1209+
//save_string_to_file(stream.str(), "../../../../../TensorData.txt");
10851210
res->set_inputs(&ubatch);
1086-
1211+
stop_watch_val.reset();
10871212
const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1);
1213+
std::cout << "GRAPH COMPUTE TIME: " << stop_watch_val.total_time_elapsed() << std::endl;
10881214
if (compute_status != GGML_STATUS_SUCCESS) {
10891215
switch (compute_status) {
10901216
case GGML_STATUS_ABORTED:

0 commit comments

Comments (0)