14 | 14 | #include <sstream> |
15 | 15 | #include <string> |
16 | 16 | #include <vector> |
| 17 | +#include <mutex> |
17 | 18 |
18 | 19 | // Forward declarations for internal cache access |
19 | 20 | struct llama_memory_hybrid; |
@@ -92,6 +93,7 @@ static void sigint_handler(int signo) { |
92 | 93 | struct callback_data { |
93 | 94 | std::vector<uint8_t> data; |
94 | 95 | std::map<std::string, int32_t> tensors; |
| 96 | + std::mutex mutex; |
95 | 97 | }; |
96 | 98 |
97 | 99 |
@@ -210,6 +212,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne |
210 | 212 |
211 | 213 | static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { |
212 | 214 | auto * cb_data = (callback_data *) user_data; |
| 215 | + std::lock_guard<std::mutex> lock(cb_data->mutex); |
213 | 216 |
214 | 217 | const struct ggml_tensor * src0 = t->src[0]; |
215 | 218 | const struct ggml_tensor * src1 = t->src[1]; |
@@ -241,16 +244,18 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { |
241 | 244 |
242 | 245 | if (!ggml_is_quantized(t->type)) { |
243 | 246 | uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); |
244 | | - ggml_print_tensor(data, t->type, t->ne, t->nb, 3); |
245 | | - if (std::string(t->name).substr(0, std::string("post_moe-").size()) == "post_moe-" || |
246 | | - std::string(t->name).substr(0, std::string("state_1d-").size()) == "state_1d-") { |
247 | | - if (cb_data->tensors.count(t->name) == 0) { |
248 | | - cb_data->tensors[t->name] = 1; |
| 247 | + std::string tensor_name(t->name); |
| 248 | + if (tensor_name.substr(0, std::string("post_moe-").size()) == "post_moe-" ||
| 249 | + tensor_name.substr(0, std::string("state_1d-").size()) == "state_1d-") {
| 250 | + |
| 251 | + if (cb_data->tensors.count(tensor_name) == 0) { |
| 252 | + cb_data->tensors[tensor_name] = 1; |
249 | 253 | } else { |
250 | | - cb_data->tensors[t->name]++; |
| 254 | + cb_data->tensors[tensor_name]++; |
251 | 255 | } |
252 | | - save_tensor(t, data, (std::string(t->name) + "_" + std::to_string(cb_data->tensors[t->name]) + ".bin").c_str()); |
| 256 | + save_tensor(t, data, (tensor_name + "_" + std::to_string(cb_data->tensors[tensor_name]) + ".bin").c_str());
253 | 257 | } |
| 258 | + ggml_print_tensor(data, t->type, t->ne, t->nb, 3); |
254 | 259 | } |
255 | 260 |
256 | 261 | return true; |
@@ -312,9 +317,9 @@ int main(int argc, char ** argv) { |
312 | 317 | std::vector<common_chat_msg> chat_msgs; |
313 | 318 |
314 | 319 | // load the model and apply lora adapter, if any |
| 320 | + callback_data cb_data; |
315 | 321 | if (params.n_predict > 0 && params.n_predict < 50) { |
316 | 322 | // enable debug prints if we print small number of tokens |
317 | | - callback_data cb_data; |
318 | 323 | params.cb_eval = ggml_debug; |
319 | 324 | params.cb_eval_user_data = &cb_data; |
320 | 325 | } |
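
The commit makes two fixes worth calling out: it guards the shared per-tensor counter map with a mutex, since the eval-debug callback can be invoked from more than one backend thread, and it hoists `cb_data` out of the inner `if` block so the pointer stored in `params.cb_eval_user_data` no longer dangles once that block exits. Below is a minimal, self-contained sketch of that pattern; `toy_tensor` and `debug_cb` are illustrative stand-ins, not ggml API (the real callback has the signature `bool (ggml_tensor * t, bool ask, void * user_data)`, as seen in `ggml_debug` above).

// Sketch of the mutex-guarded eval-debug callback pattern from this commit.
// toy_tensor stands in for ggml_tensor so the example compiles on its own.
#include <cstdint>
#include <cstdio>
#include <map>
#include <mutex>
#include <string>

struct toy_tensor { const char * name; };   // stand-in for ggml_tensor

struct callback_data {
    std::map<std::string, int32_t> tensors; // per-name invocation count
    std::mutex mutex;                       // guards `tensors`: the callback
                                            // may fire on multiple threads
};

static bool debug_cb(toy_tensor * t, void * user_data) {
    auto * cb_data = (callback_data *) user_data;
    std::lock_guard<std::mutex> lock(cb_data->mutex);

    std::string name(t->name);
    // only track the two prefixes the commit filters on
    if (name.rfind("post_moe-", 0) != 0 && name.rfind("state_1d-", 0) != 0) {
        return true;
    }
    int32_t n = ++cb_data->tensors[name];   // operator[] zero-initializes new keys
    printf("would save %s_%d.bin\n", name.c_str(), (int) n);
    return true;
}

int main() {
    // cb_data must outlive every evaluation that can invoke the callback;
    // declaring it inside the if-block (as before this commit) left the
    // stored user_data pointer dangling once the block exited.
    callback_data cb_data;
    toy_tensor t = { "post_moe-0" };
    debug_cb(&t, &cb_data);                 // -> would save post_moe-0_1.bin
    debug_cb(&t, &cb_data);                 // -> would save post_moe-0_2.bin
}

Note that `++cb_data->tensors[name]` collapses the count-then-branch logic in the diff (`count(...) == 0` followed by `= 1` or `++`) into a single line, since `std::map::operator[]` default-constructs missing values to zero.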