@@ -13,9 +13,6 @@
 #include <thread>
 #include <unordered_map>
 
-// static std::vector prune_map = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29};
-static std::vector<int> prune_map = {3};
-
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -64,7 +61,7 @@ static std::string remap_imatrix(const std::string & orig_name, const std::map<
 
     for (const auto & p : mapped) {
         if (p.second == blk) {
-            // LLAMA_LOG_DEBUG("(imatrix -> %d) ", p.first);
+            LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
             return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
         }
     }
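The hunk above activates the previously commented-out debug log and then rewrites the block id inside the tensor name through the regex capture group. As a standalone illustration of that rename mechanic (the name "blk.7.attn_v.weight" and the target id 5 are made-up example values, not taken from this diff):

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    std::string name = "blk.7.attn_v.weight";
    const std::regex pattern(R"(blk\.(\d+)\.)");
    std::smatch match;
    if (std::regex_search(name, match, pattern)) {
        // match.position(1)/match.length(1) address only the block-id capture,
        // so the rest of the tensor name is left untouched
        name.replace(match.position(1), match.length(1), std::to_string(5));
    }
    std::cout << name << "\n"; // prints "blk.5.attn_v.weight"
    return 0;
}
```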
@@ -621,14 +618,20 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     gguf_context_ptr ctx_out { gguf_init_empty() };
 
+    std::vector<int> prune_list = {};
+    if (params->prune_layers) {
+        prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
+    }
+
     // copy the KV pairs from the input file
     gguf_set_kv(ctx_out.get(), ml.meta.get());
     gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
     gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
 
-    // ToDo: Add test for --tensor-prune condition
-    const auto block_count = gguf_get_val_u32(ctx_out.get(), LLM_KV_BLOCK_COUNT) - prune_map.size();
-    gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), block_count);
+    if (!prune_list.empty()) {
+        const auto block_count = gguf_get_val_u32(ctx_out.get(), LLM_KV_BLOCK_COUNT) - prune_list.size();
+        gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), block_count);
+    }
 
     // Remove split metadata
     gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
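With this hunk, pruning is driven entirely by params->prune_layers: the opaque pointer is cast back to a const std::vector<int> *, and the GGUF block count is only rewritten when the list is non-empty. A minimal caller sketch, assuming the public llama_model_quantize() entry point and the prune_layers field this PR adds (file names and block ids are placeholders):

```cpp
#include <vector>
#include "llama.h"

int main() {
    // Blocks to drop; the ids here are arbitrary example values.
    std::vector<int> prune = {3, 7};

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    qparams.prune_layers = &prune; // opaque pointer, cast back inside llama-quant

    // returns 0 on success
    return llama_model_quantize("model-f16.gguf", "model-q4_k_m-pruned.gguf", &qparams);
}
```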
@@ -661,8 +664,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
     tensors.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
-        // ToDo: Add test for --tensor-prune condition
-        const std::string remapped_name(remap_layer(it.first, prune_map, mapped, next_blk_id));
+        const std::string remapped_name(remap_layer(it.first, prune_list, mapped, next_blk_id));
         if (remapped_name == "X") {
             if (it.first.find("attn_v.weight") != std::string::npos ||
                 it.first.find("attn_qkv.weight") != std::string::npos ||
@@ -673,7 +675,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             continue;
         } else if (remapped_name != it.first) {
             ggml_set_name(it.second.tensor, remapped_name.c_str());
-            // LLAMA_LOG_DEBUG("%s: tensor %s remmaped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
+            LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
         }
         tensors.push_back(&it.second);
     }
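remap_layer itself sits outside this diff; the loop above only relies on its contract: return the name unchanged for tensors that carry no block id, the sentinel "X" for tensors in pruned blocks, and a densely renumbered name otherwise. A rough sketch of that contract under those assumptions (remap_layer_sketch is a hypothetical stand-in, not the PR's code, with the signature inferred from the call site):

```cpp
#include <algorithm>
#include <map>
#include <regex>
#include <string>
#include <vector>

static std::string remap_layer_sketch(const std::string & name,
                                      const std::vector<int> & prune,
                                      std::map<int, int> & mapped,
                                      int & next_blk_id) {
    static const std::regex pattern(R"(blk\.(\d+)\.)");
    std::smatch match;
    if (!std::regex_search(name, match, pattern)) {
        return name; // not a per-block tensor (e.g. token_embd.weight)
    }
    const int blk = std::stoi(match[1]);
    if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
        return "X"; // sentinel checked by the caller: this block is pruned
    }
    if (mapped.find(blk) == mapped.end()) {
        mapped[blk] = next_blk_id++; // assign the next dense block id
    }
    std::string out = name;
    out.replace(match.position(1), match.length(1), std::to_string(mapped[blk]));
    return out;
}
```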
@@ -1019,6 +1021,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.imatrix      =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
         /*.tensor_type  =*/ nullptr,
+        /*.prune_layers =*/ nullptr
     };
 
     return result;