@@ -5,11 +5,9 @@
 #include "llama-model-loader.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstring>
 #include <cinttypes>
 #include <fstream>
-#include <mutex>
 #include <thread>
 #include <unordered_map>
 
@@ -48,7 +46,7 @@ struct quantize_state_impl {
 };
 
 static void llama_tensor_dequantize_impl(
-    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
     if (output.size() < nelements) {
@@ -536,7 +534,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     model.load_hparams(ml);
     model.load_stats (ml);
 
-    struct quantize_state_impl qs(model, params);
+    quantize_state_impl qs(model, params);
 
     if (params->only_copy) {
         ftype = ml.ftype;
@@ -661,7 +659,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // populate the original tensors so we get an initial meta data
     for (const auto * it : tensors) {
         uint16_t i_split = params->keep_split ? it->idx : 0;
-        struct ggml_tensor * tensor = it->tensor;
+        ggml_tensor * tensor = it->tensor;
         if (!ctx_outs[i_split]) {
             ctx_outs[i_split].reset(gguf_init_empty());
         }
@@ -710,7 +708,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     new_ofstream(0);
     for (const auto * it : tensors) {
         const auto & weight = *it;
-        struct ggml_tensor * tensor = weight.tensor;
+        ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
             new_ofstream(weight.idx);
@@ -776,7 +774,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
-        enum ggml_type new_type;
+        ggml_type new_type;
         void * new_data;
         size_t new_size;
 
@@ -950,8 +948,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 // interface implementation
 //
 
-struct llama_model_quantize_params llama_model_quantize_default_params() {
-    struct llama_model_quantize_params result = {
+llama_model_quantize_params llama_model_quantize_default_params() {
+    llama_model_quantize_params result = {
         /*.nthread            =*/ 0,
         /*.ftype              =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.output_tensor_type =*/ GGML_TYPE_COUNT,
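A note on the change itself: in C++ (unlike C), a `struct` or `enum` tag is directly usable as a type name, so the `struct`/`enum` elaborated-type-specifiers removed here were redundant; the dropped `<cmath>` and `<mutex>` includes were presumably no longer used by this translation unit. Below is a minimal sketch of the language rule, using hypothetical stand-in types rather than the real ggml definitions:

```cpp
#include <cstdint>

// Hypothetical stand-ins for ggml_tensor / ggml_type; the real definitions
// live in ggml.h and are not reproduced here.
struct tensor_like {
    int64_t ne[4]; // elements per dimension
};

enum type_like {
    TYPE_F32,
    TYPE_Q5_1,
};

int main() {
    // C style (also valid C++): the elaborated-type-specifier names the tag.
    struct tensor_like a = {};
    enum type_like t1 = TYPE_F32;

    // C++ only: the tag is itself a type name, so the keyword is redundant --
    // the same simplification this diff applies.
    tensor_like b = {};
    type_like t2 = TYPE_Q5_1;

    (void)a; (void)b; (void)t1; (void)t2;
    return 0;
}
```

The keyword still has to remain in declarations that a plain C compiler sees (such as a public C API header), since C accepts a bare tag name only via `typedef`; dropping it is safe in this file because it is compiled as C++.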