Commit 322c7d3

wip
ggml-ci

1 parent 53e61c6 commit 322c7d3

File tree

7 files changed: +106 -89 lines

src/llama-adapter.cpp
Lines changed: 1 addition & 1 deletion

@@ -257,7 +257,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     }

     // device buft and device ctx
-    const auto * model_tensor = model.get_tensor( name.c_str());
+    const auto * model_tensor = model.get_tensor(name.c_str());
     if (!model_tensor) {
         throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
     }

src/llama-model-loader.cpp
Lines changed: 61 additions & 0 deletions

@@ -7,6 +7,10 @@
 #include <cstring>
 #include <future>

+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
 const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -17,6 +21,49 @@ const char * llama_file_version_name(llama_fver version) {
     return "unknown";
 }

+static std::string llama_model_ftype_name(llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32: return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+
+        default: return "unknown, may not work";
+    }
+}
+
 namespace GGUFMeta {
     template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
     struct GKV_Base_Type {
@@ -1008,3 +1055,17 @@ bool llama_model_loader::load_all_data(

     return true;
 }
+
+std::string llama_model_loader::ftype_name() const {
+    return llama_model_ftype_name(ftype);
+}
+
+void llama_model_loader::print_info() const {
+    LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
+    LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
+    if (n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+    }
+}
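Note (not part of the diff): the llama_model_loader::print_info() added above reports the file size in MiB below one GiB and in GiB otherwise, together with the average bits per weight (BPW = n_bytes*8 / n_elements). A minimal standalone C++ sketch of that formatting, using made-up byte and parameter counts rather than values from any real model:

#include <cstdio>
#include <cstdint>

int main() {
    const size_t kiB = 1024, MiB = 1024*kiB, GiB = 1024*MiB; // same constants the loader now defines
    const uint64_t n_bytes    = 4ull*GiB + 300ull*MiB;       // hypothetical file size
    const uint64_t n_elements = 8000000000ull;               // hypothetical parameter count

    const double bpw = n_bytes*8.0/n_elements;               // bits per weight
    if (n_bytes < GiB) {
        printf("file size = %.2f MiB (%.2f BPW)\n", n_bytes/1024.0/1024.0, bpw);
    } else {
        printf("file size = %.2f GiB (%.2f BPW)\n", n_bytes/1024.0/1024.0/1024.0, bpw);
    }
    return 0;
}

With these hypothetical values the GiB branch is taken, mirroring what the loader would log for a multi-gigabyte GGUF file.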

src/llama-model-loader.h
Lines changed: 4 additions & 0 deletions

@@ -155,4 +155,8 @@ struct llama_model_loader {
         llama_mlocks * lmlocks,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data);
+
+    std::string ftype_name() const;
+
+    void print_info() const;
 };
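Note (sketch, not part of the diff): with these declarations the loader can report file-level information on its own, before a llama_model is built. A hedged usage fragment, mirroring the constructor call visible in the src/llama.cpp hunk further down; the file name here is hypothetical:

llama_model_loader ml("model.gguf", /*use_mmap=*/true, /*check_tensors=*/false, /*kv_overrides=*/nullptr);
ml.print_info();                           // logs file format, file type and file size (BPW)
const std::string ftype = ml.ftype_name(); // human-readable quantization name, e.g. "Q4_K - Medium"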

src/llama-model.cpp
Lines changed: 21 additions & 72 deletions

@@ -11,10 +11,6 @@
 #include <sstream>
 #include <stdexcept>

-static const size_t kiB = 1024;
-static const size_t MiB = 1024*kiB;
-static const size_t GiB = 1024*MiB;
-
 const char * llm_type_name(llm_type type) {
     switch (type) {
         case MODEL_14M: return "14M";
@@ -83,49 +79,6 @@ const char * llm_type_name(llm_type type) {
     }
 }

-static std::string llama_model_ftype_name(llama_ftype ftype) {
-    if (ftype & LLAMA_FTYPE_GUESSED) {
-        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
-    }
-
-    switch (ftype) {
-        case LLAMA_FTYPE_ALL_F32: return "all F32";
-        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
-        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
-        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
-        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
-        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
-        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
-        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
-        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
-        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
-
-        default: return "unknown, may not work";
-    }
-}
-
 static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
     switch (type) {
         case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
@@ -142,10 +95,6 @@ std::string llama_model::type_name() const {
     return llm_type_name(type);
 }

-std::string llama_model::ftype_name() const {
-    return llama_model_ftype_name(ftype);
-}
-
 template<typename F>
 static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
     ggml_init_params params = {
@@ -208,6 +157,10 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
     return it->second;
 }

+size_t llama_model::size() const {
+    return n_bytes;
+}
+
 size_t llama_model::max_nodes() const {
     return std::max<size_t>(8192, tensors_by_name.size()*5);
 }
@@ -1100,7 +1053,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         default: throw std::runtime_error("unsupported model architecture");
     }

-    ftype = ml.ftype;
+    n_bytes = ml.n_bytes;
+
+    desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();

     if (hparams.f_max_alibi_bias > 0.0f) {
         hparams.use_alibi = true;
@@ -1820,7 +1775,11 @@ void llama_model::load_vocab(llama_model_loader & ml) {
     }
 }

-void llama_model::print_meta(llama_model_loader & ml) {
+std::string llama_model::desc() const {
+    return desc_str;
+}
+
+void llama_model::print_info() const {
     const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);

     auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
@@ -1853,7 +1812,6 @@ void llama_model::print_meta(llama_model_loader & ml) {
     };

     // hparams
-    LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
     LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
     LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
@@ -1897,20 +1855,14 @@ void llama_model::print_meta(llama_model_loader & ml) {
     }

     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
-    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, ftype_name().c_str());
-    if (ml.n_elements >= 1e12) {
-        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
-    } else if (ml.n_elements >= 1e9) {
-        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
-    } else if (ml.n_elements >= 1e6) {
-        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
-    } else {
-        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
-    }
-    if (ml.n_bytes < GiB) {
-        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    if (n_elements >= 1e12) {
+        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, n_elements*1e-12);
+    } else if (n_elements >= 1e9) {
+        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, n_elements*1e-9);
+    } else if (n_elements >= 1e6) {
+        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, n_elements*1e-6);
     } else {
-        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, n_elements*1e-3);
     }

     // general kv
@@ -2154,14 +2106,11 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
 }

 int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "%s %s %s",
-            model->arch_name ().c_str(),
-            model->type_name ().c_str(),
-            model->ftype_name().c_str());
+    return snprintf(buf, buf_size, "%s", model->desc().c_str());
 }

 uint64_t llama_model_size(const struct llama_model * model) {
-    return model->n_bytes;
+    return model->size();
 }

 uint64_t llama_model_n_params(const struct llama_model * model) {
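Note (sketch, not part of the diff): llama_model_desc() now just copies the description string that load_hparams() caches in desc_str (arch name, type name and ftype name joined by spaces), instead of reassembling it from three calls. A hedged example of reading it through the public C API; the buffer size and the example output are illustrative only:

char desc[128];                              // illustrative buffer size
llama_model_desc(model, desc, sizeof(desc)); // e.g. "llama 7B Q4_K - Medium"
printf("%s, %.2f GiB\n", desc, llama_model_size(model)/1024.0/1024.0/1024.0);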

src/llama-model.h
Lines changed: 15 additions & 14 deletions

@@ -286,12 +286,10 @@ struct llama_model {
     llm_type type = MODEL_UNKNOWN;
     llm_arch arch = LLM_ARCH_UNKNOWN;

-    llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
     std::string name = "n/a";

     llama_hparams hparams = {};
-    llama_vocab vocab;
+    llama_vocab vocab = {};

     struct ggml_tensor * tok_embd = nullptr;
     struct ggml_tensor * type_embd = nullptr;
@@ -320,6 +318,7 @@ struct llama_model {
     std::unordered_map<std::string, std::string> gguf_kv;

     llama_split_mode split_mode;
+
     int main_gpu;
     int n_gpu_layers;

@@ -328,7 +327,6 @@
     // list of devices used in this model
     std::vector<ggml_backend_dev_t> devices;

-
     // lists of buffer types used for each layer
     using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
     buft_list_t cpu_buft_list;
@@ -365,27 +363,30 @@
     // total number of parameters in the model
     uint64_t n_elements = 0;

-    // total size of all the tensors in the model in bytes
-    size_t n_bytes = 0;
-
-    std::string arch_name() const;
-    std::string type_name() const;
-    std::string ftype_name() const;
-
     ggml_backend_buffer_type_t select_buft(int il) const;

     const struct ggml_tensor * get_tensor(const char * name) const;

-    size_t max_nodes() const;
-
     void load_stats (llama_model_loader & ml);
     void load_arch (llama_model_loader & ml);
     void load_hparams(llama_model_loader & ml);
     void load_vocab (llama_model_loader & ml);

-    void print_meta(llama_model_loader & ml);
+    std::string arch_name() const;
+    std::string type_name() const;
+
+    std::string desc() const;
+
+    size_t size() const;
+    size_t max_nodes() const;
+
+    void print_info() const;

 private:
+    size_t n_bytes = 0;
+
+    std::string desc_str;
+
     std::string token_to_piece(llama_token token, bool special) const;
 };

src/llama-quant.cpp
Lines changed: 1 addition & 1 deletion

@@ -537,7 +537,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     struct quantize_state_impl qs(model, params);

     if (params->only_copy) {
-        ftype = model.ftype;
+        ftype = ml.ftype;
     }
     const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
     if (params->imatrix) {

src/llama.cpp
Lines changed: 3 additions & 1 deletion

@@ -2462,6 +2462,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);

+        ml.print_info();
+
         model.hparams.vocab_only = params.vocab_only;

         try {
@@ -2481,7 +2483,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         }

         model.load_stats(ml);
-        model.print_meta(ml);
+        model.print_info();

         if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
             model.hparams.n_vocab != model.vocab.id_to_token.size()) {
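Note (simplified sketch, not a verbatim copy of llama_model_load): after this change the file-level information is logged by the loader as soon as the GGUF file is opened, and the model-level summary is logged once the statistics have been loaded:

llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
ml.print_info();        // file format, file type, file size (BPW)
// ... load_arch(), load_hparams(), load_vocab(), tensor creation ...
model.load_stats(ml);
model.print_info();     // arch, hparams, model type, model params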

0 commit comments