1- #include " llama.h"
21#include " llama-impl.h"
32#include " llama-model.h"
43#include " llama-model-loader.h"
4+ #include " llama-ext.h"
55
6+ #include < algorithm>
67#include < cmath>
78#include < cstring>
8- #include < string>
99#include < cinttypes>
1010#include < fstream>
1111#include < mutex>
@@ -197,6 +197,7 @@ struct quantize_state_impl {
197197
198198// per-tensor metadata, computed in the preliminary loop and used in the main loop
199199struct tensor_metadata {
200+ std::string name;
200201 ggml_type target_type;
201202 tensor_category category;
202203 std::string remapped_imatrix_name;
@@ -788,7 +789,7 @@ static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type ds
788789// given a file type, get the default tensor type
789790//
790791
791- static ggml_type llama_ftype_get_default_type (llama_ftype ftype) {
792+ ggml_type llama_ftype_get_default_type (llama_ftype ftype) {
792793 switch (ftype) {
793794 case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
794795 case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
@@ -827,16 +828,32 @@ static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
827828 case LLAMA_FTYPE_MOSTLY_IQ3_S:
828829 case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
829830
830- default : throw std::runtime_error ( format ( " invalid output file type %d \n " , ftype)) ;
831+ default : return GGML_TYPE_COUNT ;
831832 }
832833}
833834
835+
836+ static void init_quantize_state_counters (quantize_state_impl & qs, std::vector<tensor_metadata> & metadata) {
837+ for (auto & tm : metadata) {
838+ tensor_category cat = tensor_get_category (tm.name );
839+ tm.category = cat;
840+
841+ if (category_is_attn_v (cat)) {
842+ ++qs.n_attention_wv ;
843+ }
844+
845+ if (cat == tensor_category::OUTPUT) {
846+ qs.has_tied_embeddings = false ;
847+ }
848+ }
849+ qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int )qs.model .hparams .n_layer ;
850+ }
851+
834852//
835853// main quantization driver
836854//
837855
838856static void llama_model_quantize_impl (const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
839- ggml_type default_type;
840857 llama_ftype ftype = params->ftype ;
841858
842859 int nthread = params->nthread ;
@@ -845,7 +862,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
845862 nthread = std::thread::hardware_concurrency ();
846863 }
847864
848- default_type = llama_ftype_get_default_type (ftype);
865+ ggml_type default_type = llama_ftype_get_default_type (ftype);
866+ if (default_type == GGML_TYPE_COUNT) {
867+ throw std::runtime_error (format (" invalid output file type %d\n " , ftype));
868+ }
849869
850870 // mmap consistently increases speed on Linux, and also increases speed on Windows with
851871 // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
@@ -964,6 +984,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
964984 });
965985 }
966986
987+ // compute tensor metadata once and cache it
988+ std::vector<tensor_metadata> metadata (tensors.size ());
989+ for (size_t i = 0 ; i < tensors.size (); ++i) {
990+ metadata[i].name = ggml_get_name (tensors[i]->tensor );
991+ }
992+
993+ // initialize quantization state counters and metadata categories
994+ init_quantize_state_counters (qs, metadata);
995+
967996 int idx = 0 ;
968997 uint16_t n_split = 1 ;
969998
@@ -976,25 +1005,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
9761005 std::vector<gguf_context_ptr> ctx_outs (n_split);
9771006 ctx_outs[0 ] = std::move (ctx_out);
9781007
979- // compute tensor metadata once and cache it
980- std::vector<tensor_metadata> metadata (tensors.size ());
981-
982- // initialize quantization state before preliminary loop (counters for use_more_bits)
983- {
984- for (size_t i = 0 ; i < tensors.size (); ++i) {
985- const auto cat = tensor_get_category (tensors[i]->tensor ->name );
986- if (category_is_attn_v (cat)) {
987- ++qs.n_attention_wv ;
988- }
989- if (cat == tensor_category::OUTPUT) {
990- qs.has_tied_embeddings = false ;
991- }
992- metadata[i].category = cat; // save and re-use the category while we're at it
993- }
994- // these also need to be set to n_layer by default
995- qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int )qs.model .hparams .n_layer ;
996- }
997-
9981008 // flag for --dry-run
9991009 bool will_require_imatrix = false ;
10001010
@@ -1005,7 +1015,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
10051015 for (size_t i = 0 ; i < tensors.size (); ++i) {
10061016 const auto * it = tensors[i];
10071017 const struct ggml_tensor * tensor = it->tensor ;
1008- const std::string name = ggml_get_name (tensor);
10091018
10101019 uint16_t i_split = params->keep_split ? it->idx : 0 ;
10111020 if (!ctx_outs[i_split]) {
@@ -1034,7 +1043,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
10341043 " - offending tensor: %s\n "
10351044 " - target type: %s\n "
10361045 " ============================================================================\n\n " ,
1037- name.c_str (), ggml_type_name (metadata[i].target_type ));
1046+ metadata[i]. name .c_str (), ggml_type_name (metadata[i].target_type ));
10381047 throw std::runtime_error (" this quantization requires an imatrix!" );
10391048 }
10401049 }
@@ -1107,7 +1116,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
11071116 new_ofstream (weight.idx );
11081117 }
11091118
1110- const std::string name = ggml_get_name (tensor);
11111119 const size_t tensor_size = ggml_nbytes (tensor);
11121120
11131121 if (!params->dry_run ) {
@@ -1238,9 +1246,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
12381246 total_size_new += new_size;
12391247
12401248 // update the gguf meta data as we go
1241- gguf_set_tensor_type (ctx_outs[cur_split].get (), name.c_str (), new_type);
1242- GGML_ASSERT (gguf_get_tensor_size (ctx_outs[cur_split].get (), gguf_find_tensor (ctx_outs[cur_split].get (), name.c_str ())) == new_size);
1243- gguf_set_tensor_data (ctx_outs[cur_split].get (), name.c_str (), new_data);
1249+ gguf_set_tensor_type (ctx_outs[cur_split].get (), metadata[i]. name .c_str (), new_type);
1250+ GGML_ASSERT (gguf_get_tensor_size (ctx_outs[cur_split].get (), gguf_find_tensor (ctx_outs[cur_split].get (), metadata[i]. name .c_str ())) == new_size);
1251+ gguf_set_tensor_data (ctx_outs[cur_split].get (), metadata[i]. name .c_str (), new_data);
12441252
12451253 // write tensor data + padding
12461254 fout.write ((const char *) new_data, new_size);
@@ -1305,3 +1313,89 @@ uint32_t llama_model_quantize(
13051313
13061314 return 0 ;
13071315}
1316+
1317+ //
1318+ // Helper functions for external tools exposed in llama-ext.h
1319+ //
1320+
1321+ quantize_state_impl * llama_quant_init (
1322+ const llama_model * model,
1323+ const llama_model_quantize_params * params) {
1324+ return new quantize_state_impl (*model, params);
1325+ }
1326+
1327+ void llama_quant_free (quantize_state_impl * qs) {
1328+ delete qs;
1329+ }
1330+
1331+ llama_model * llama_quant_model_from_metadata (const llama_quant_model_desc * desc) {
1332+ struct llama_model_params mparams = llama_model_default_params ();
1333+ auto * model = new llama_model (mparams);
1334+
1335+ model->arch = llm_arch_from_string (desc->architecture );
1336+
1337+ // infer llm_type: only LLM_TYPE_70B matters for quantization logic
1338+ if (model->arch == LLM_ARCH_LLAMA && desc->n_layer == 80 && desc->n_head != desc->n_head_kv ) {
1339+ model->type = LLM_TYPE_70B;
1340+ }
1341+
1342+ model->hparams .n_embd = desc->n_embd ;
1343+ model->hparams .n_embd_head_k_full = desc->n_embd_head_k ;
1344+ model->hparams .n_embd_head_v_full = desc->n_embd_head_v ;
1345+ model->hparams .n_layer = desc->n_layer ;
1346+ model->hparams .n_expert = desc->n_expert ;
1347+
1348+ for (uint32_t i = 0 ; i < desc->n_layer ; i++) {
1349+ model->hparams .n_head_arr [i] = desc->n_head ;
1350+ model->hparams .n_head_kv_arr [i] = desc->n_head_kv ;
1351+ model->hparams .n_ff_arr [i] = desc->n_ff ;
1352+ }
1353+
1354+ return model;
1355+ }
1356+
1357+ bool llama_quant_tensor_allows_quantization (
1358+ const quantize_state_impl * qs,
1359+ const ggml_tensor * tensor) {
1360+ return tensor_allows_quantization (qs->params , qs->model .arch , tensor);
1361+ }
1362+
1363+ void llama_quant_compute_types (
1364+ quantize_state_impl * qs,
1365+ llama_ftype ftype,
1366+ ggml_tensor ** tensors,
1367+ ggml_type * result_types,
1368+ size_t n_tensors) {
1369+ // reset per-computation state
1370+ qs->n_attention_wv = 0 ;
1371+ qs->n_ffn_down = 0 ;
1372+ qs->n_ffn_gate = 0 ;
1373+ qs->n_ffn_up = 0 ;
1374+ qs->i_attention_wv = 0 ;
1375+ qs->i_ffn_down = 0 ;
1376+ qs->i_ffn_gate = 0 ;
1377+ qs->i_ffn_up = 0 ;
1378+ qs->n_fallback = 0 ;
1379+ qs->has_imatrix = false ;
1380+ qs->has_tied_embeddings = true ;
1381+
1382+ // build metadata from tensor names
1383+ std::vector<tensor_metadata> metadata (n_tensors);
1384+ for (size_t i = 0 ; i < n_tensors; i++) {
1385+ metadata[i].name = ggml_get_name (tensors[i]);
1386+ }
1387+
1388+ // initialize counters and categories
1389+ init_quantize_state_counters (*qs, metadata);
1390+
1391+ // use a local copy of params with the requested ftype
1392+ llama_model_quantize_params local_params = *qs->params ;
1393+ local_params.ftype = ftype;
1394+
1395+ ggml_type default_type = llama_ftype_get_default_type (ftype);
1396+
1397+ // compute types
1398+ for (size_t i = 0 ; i < n_tensors; i++) {
1399+ result_types[i] = llama_tensor_get_type (*qs, &local_params, tensors[i], default_type, metadata[i]);
1400+ }
1401+ }
0 commit comments