Commit 322c7d3

wip
ggml-ci

1 parent 53e61c6 commit 322c7d3

File tree

7 files changed: +106 -89 lines

src/llama-adapter.cpp
Lines changed: 1 addition & 1 deletion

@@ -257,7 +257,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     }

     // device buft and device ctx
-    const auto * model_tensor = model.get_tensor( name.c_str());
+    const auto * model_tensor = model.get_tensor(name.c_str());
     if (!model_tensor) {
         throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
     }

src/llama-model-loader.cpp
Lines changed: 61 additions & 0 deletions

@@ -7,6 +7,10 @@
 #include <cstring>
 #include <future>

+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
 const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -17,6 +21,49 @@ const char * llama_file_version_name(llama_fver version) {
     return "unknown";
 }

+static std::string llama_model_ftype_name(llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32: return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+
+        default: return "unknown, may not work";
+    }
+}
+
 namespace GGUFMeta {
     template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
     struct GKV_Base_Type {
@@ -1008,3 +1055,17 @@ bool llama_model_loader::load_all_data(

     return true;
 }
+
+std::string llama_model_loader::ftype_name() const {
+    return llama_model_ftype_name(ftype);
+}
+
+void llama_model_loader::print_info() const {
+    LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
+    LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
+    if (n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+    }
+}
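Note (not part of the diff): the llama_model_loader::print_info() added above reports the file size in MiB below one GiB and in GiB otherwise, together with the average bits per weight (BPW = n_bytes*8 / n_elements). A minimal standalone C++ sketch of that formatting, using made-up byte and parameter counts rather than values from any real model:

#include <cstdio>
#include <cstdint>

int main() {
    const size_t kiB = 1024, MiB = 1024*kiB, GiB = 1024*MiB; // same constants the loader now defines
    const uint64_t n_bytes    = 4ull*GiB + 300ull*MiB;       // hypothetical file size
    const uint64_t n_elements = 8000000000ull;               // hypothetical parameter count

    const double bpw = n_bytes*8.0/n_elements;               // bits per weight
    if (n_bytes < GiB) {
        printf("file size = %.2f MiB (%.2f BPW)\n", n_bytes/1024.0/1024.0, bpw);
    } else {
        printf("file size = %.2f GiB (%.2f BPW)\n", n_bytes/1024.0/1024.0/1024.0, bpw);
    }
    return 0;
}

With these hypothetical values the GiB branch is taken, mirroring what the loader would log for a multi-gigabyte GGUF file.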

src/llama-model-loader.h
Lines changed: 4 additions & 0 deletions

@@ -155,4 +155,8 @@ struct llama_model_loader {
         llama_mlocks * lmlocks,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data);
+
+    std::string ftype_name() const;
+
+    void print_info() const;
 };
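Note (sketch, not part of the diff): with these declarations the loader can report file-level information on its own, before a llama_model is built. A hedged usage fragment, mirroring the constructor call visible in the src/llama.cpp hunk further down; the file name here is hypothetical:

llama_model_loader ml("model.gguf", /*use_mmap=*/true, /*check_tensors=*/false, /*kv_overrides=*/nullptr);
ml.print_info();                           // logs file format, file type and file size (BPW)
const std::string ftype = ml.ftype_name(); // human-readable quantization name, e.g. "Q4_K - Medium"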

src/llama-model.cpp
Lines changed: 21 additions & 72 deletions

@@ -11,10 +11,6 @@
 #include <sstream>
 #include <stdexcept>

-static const size_t kiB = 1024;
-static const size_t MiB = 1024*kiB;
-static const size_t GiB = 1024*MiB;
-
 const char * llm_type_name(llm_type type) {
     switch (type) {
         case MODEL_14M: return "14M";
@@ -83,49 +79,6 @@ const char * llm_type_name(llm_type type) {
     }
 }

-static std::string llama_model_ftype_name(llama_ftype ftype) {
-    if (ftype & LLAMA_FTYPE_GUESSED) {
-        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
-    }
-
-    switch (ftype) {
-        case LLAMA_FTYPE_ALL_F32: return "all F32";
-        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
-        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
-        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
-        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
-        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
-        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
-        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
-        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
-        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
-
-        default: return "unknown, may not work";
-    }
-}
-
 static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
     switch (type) {
         case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
@@ -142,10 +95,6 @@ std::string llama_model::type_name() const {
     return llm_type_name(type);
 }

-std::string llama_model::ftype_name() const {
-    return llama_model_ftype_name(ftype);
-}
-
 template<typename F>
 static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
     ggml_init_params params = {
@@ -208,6 +157,10 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
     return it->second;
 }

+size_t llama_model::size() const {
+    return n_bytes;
+}
+
 size_t llama_model::max_nodes() const {
     return std::max<size_t>(8192, tensors_by_name.size()*5);
 }
@@ -1100,7 +1053,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         default: throw std::runtime_error("unsupported model architecture");
     }

-    ftype = ml.ftype;
+    n_bytes = ml.n_bytes;
+
+    desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();

     if (hparams.f_max_alibi_bias > 0.0f) {
         hparams.use_alibi = true;
@@ -1820,7 +1775,11 @@ void llama_model::load_vocab(llama_model_loader & ml) {
     }
 }

-void llama_model::print_meta(llama_model_loader & ml) {
+std::string llama_model::desc() const {
+    return desc_str;
+}
+
+void llama_model::print_info() const {
     const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);

     auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
@@ -1853,7 +1812,6 @@ void llama_model::print_meta(llama_model_loader & ml) {
     };

     // hparams
-    LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
     LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
     LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
@@ -1897,20 +1855,14 @@ void llama_model::print_meta(llama_model_loader & ml) {
     }

     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
-    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, ftype_name().c_str());
-    if (ml.n_elements >= 1e12) {
-        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
-    } else if (ml.n_elements >= 1e9) {
-        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
-    } else if (ml.n_elements >= 1e6) {
-        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
-    } else {
-        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
-    }
-    if (ml.n_bytes < GiB) {
-        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    if (n_elements >= 1e12) {
+        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, n_elements*1e-12);
+    } else if (n_elements >= 1e9) {
+        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, n_elements*1e-9);
+    } else if (n_elements >= 1e6) {
+        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, n_elements*1e-6);
     } else {
-        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, n_elements*1e-3);
     }

     // general kv
@@ -2154,14 +2106,11 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
 }

 int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "%s %s %s",
-            model->arch_name ().c_str(),
-            model->type_name ().c_str(),
-            model->ftype_name().c_str());
+    return snprintf(buf, buf_size, "%s", model->desc().c_str());
 }

 uint64_t llama_model_size(const struct llama_model * model) {
-    return model->n_bytes;
+    return model->size();
 }

 uint64_t llama_model_n_params(const struct llama_model * model) {
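Note (sketch, not part of the diff): llama_model_desc() now just copies the description string that load_hparams() caches in desc_str (arch name, type name and ftype name joined by spaces), instead of reassembling it from three calls. A hedged example of reading it through the public C API; the buffer size and the example output are illustrative only:

char desc[128];                              // illustrative buffer size
llama_model_desc(model, desc, sizeof(desc)); // e.g. "llama 7B Q4_K - Medium"
printf("%s, %.2f GiB\n", desc, llama_model_size(model)/1024.0/1024.0/1024.0);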

src/llama-model.h
Lines changed: 15 additions & 14 deletions

@@ -286,12 +286,10 @@ struct llama_model {
     llm_type type = MODEL_UNKNOWN;
     llm_arch arch = LLM_ARCH_UNKNOWN;

-    llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
     std::string name = "n/a";

     llama_hparams hparams = {};
-    llama_vocab vocab;
+    llama_vocab vocab = {};

     struct ggml_tensor * tok_embd = nullptr;
     struct ggml_tensor * type_embd = nullptr;
@@ -320,6 +318,7 @@ struct llama_model {
     std::unordered_map<std::string, std::string> gguf_kv;

     llama_split_mode split_mode;
+
     int main_gpu;
     int n_gpu_layers;

@@ -328,7 +327,6 @@
     // list of devices used in this model
     std::vector<ggml_backend_dev_t> devices;

-
     // lists of buffer types used for each layer
     using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
     buft_list_t cpu_buft_list;
@@ -365,27 +363,30 @@
     // total number of parameters in the model
     uint64_t n_elements = 0;

-    // total size of all the tensors in the model in bytes
-    size_t n_bytes = 0;
-
-    std::string arch_name() const;
-    std::string type_name() const;
-    std::string ftype_name() const;
-
     ggml_backend_buffer_type_t select_buft(int il) const;

     const struct ggml_tensor * get_tensor(const char * name) const;

-    size_t max_nodes() const;
-
     void load_stats (llama_model_loader & ml);
     void load_arch (llama_model_loader & ml);
     void load_hparams(llama_model_loader & ml);
     void load_vocab (llama_model_loader & ml);

-    void print_meta(llama_model_loader & ml);
+    std::string arch_name() const;
+    std::string type_name() const;
+
+    std::string desc() const;
+
+    size_t size() const;
+    size_t max_nodes() const;
+
+    void print_info() const;

 private:
+    size_t n_bytes = 0;
+
+    std::string desc_str;
+
     std::string token_to_piece(llama_token token, bool special) const;
 };

src/llama-quant.cpp
Lines changed: 1 addition & 1 deletion

@@ -537,7 +537,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     struct quantize_state_impl qs(model, params);

     if (params->only_copy) {
-        ftype = model.ftype;
+        ftype = ml.ftype;
     }
     const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
     if (params->imatrix) {

src/llama.cpp
Lines changed: 3 additions & 1 deletion

@@ -2462,6 +2462,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);

+        ml.print_info();
+
         model.hparams.vocab_only = params.vocab_only;

         try {
@@ -2481,7 +2483,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         }

         model.load_stats(ml);
-        model.print_meta(ml);
+        model.print_info();

         if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
             model.hparams.n_vocab != model.vocab.id_to_token.size()) {
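Note (simplified sketch, not a verbatim copy of llama_model_load): after this change the file-level information is logged by the loader as soon as the GGUF file is opened, and the model-level summary is logged once the statistics have been loaded:

llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
ml.print_info();        // file format, file type, file size (BPW)
// ... load_arch(), load_hparams(), load_vocab(), tensor creation ...
model.load_stats(ml);
model.print_info();     // arch, hparams, model type, model params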

0 commit comments