@@ -22,7 +22,7 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
-struct quantize_state_internal {
+struct quantize_state_impl {
     const llama_model & model;
     const llama_model_quantize_params * params;
 
@@ -43,13 +43,13 @@ struct quantize_state_internal {
     // used to figure out if a model shares tok_embd with the output weight
     bool has_output = false;
 
-    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
         {}
 };
 
-static void llama_tensor_dequantize_internal(
+static void llama_tensor_dequantize_impl(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
@@ -121,7 +121,7 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
-static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
     // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -410,7 +410,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }
 
-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     if (nthread < 2) {
         // single-thread
         size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
@@ -464,7 +464,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     return new_size;
 }
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;
 
@@ -534,7 +534,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_hparams(ml, model);
     llm_load_stats(ml, model);
 
-    struct quantize_state_internal qs(model, params);
+    struct quantize_state_impl qs(model, params);
 
     if (params->only_copy) {
         ftype = model.ftype;
@@ -837,7 +837,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
             throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
         } else {
-            llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+            llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
             f32_data = (float *) f32_conv_buf.data();
         }
 
@@ -866,7 +866,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
             const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
 
-            new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+            new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
         }
         LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
     }
@@ -919,7 +919,7 @@ uint32_t llama_model_quantize(
         const char * fname_out,
         const llama_model_quantize_params * params) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, params);
+        llama_model_quantize_impl(fname_inp, fname_out, params);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
         return 1;
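
This diff is a pure rename: every `_internal` helper in the quantization path becomes `_impl`, while the public entry point `llama_model_quantize` in llama.h keeps its signature and error-handling contract, as the last hunk shows. As a quick illustration that callers are unaffected, here is a minimal sketch of driving that entry point; the file names and thread count are placeholders, not taken from this commit:

```cpp
// Usage sketch for the unchanged public API (file names are hypothetical).
#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization scheme
    params.nthread = 4;                         // worker threads

    // Returns 0 on success; exceptions thrown inside
    // llama_model_quantize_impl are caught and reported via the log,
    // exactly as before the rename.
    return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
}
```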