2 changes: 1 addition & 1 deletion include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ extern "C" {
LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

// Returns the total size of all the tensors in the model in bytes
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
LLAMA_API size_t llama_model_size(const struct llama_model * model);

// Returns the total number of parameters in the model
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
Expand Down
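For callers, the only visible difference is the return type. A minimal usage sketch (the model path is a placeholder; llama_load_model_from_file, llama_free_model, and llama_model_default_params are the public entry points of this API generation; backend init is elided):

#include "llama.h"
#include <cstdio>

int main(void) {
    struct llama_model_params mp = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file("model.gguf", mp);
    if (model == NULL) {
        return 1;
    }

    // llama_model_size() now returns size_t, matching printf's %zu;
    // llama_model_n_params() stays uint64_t
    printf("model size : %zu bytes\n", llama_model_size(model));
    printf("parameters : %llu\n", (unsigned long long) llama_model_n_params(model));

    llama_free_model(model);
    return 0;
}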
src/llama.cpp (33 changes: 19 additions & 14 deletions)

@@ -2907,9 +2907,15 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

-    int64_t t_load_us = 0;
+    int64_t t_load_us  = 0;
     int64_t t_start_us = 0;

+    // total number of parameters in the model
+    uint64_t n_elements = 0;
+
+    // total size of all the tensors in the model in bytes
+    size_t n_bytes = 0;
+
     // keep track of loaded lora adapters
     std::set<struct llama_lora_adapter *> lora_adapters;
@@ -4275,8 +4281,8 @@ struct llama_model_loader {
     int n_tensors = 0;
     int n_created = 0;

-    int64_t n_elements = 0;
-    size_t  n_bytes    = 0;
+    uint64_t n_elements = 0;
+    size_t   n_bytes    = 0;

     bool use_mmap = false;
     bool check_tensors;
@@ -5344,6 +5350,11 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     }
 }

+static void llm_load_stats(llama_model_loader & ml, llama_model & model) {
+    model.n_elements = ml.n_elements;
+    model.n_bytes    = ml.n_bytes;
+}
+
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
     model.arch = ml.get_arch();
     if (model.arch == LLM_ARCH_UNKNOWN) {
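llm_load_stats only copies totals that the loader has already accumulated. For context, a sketch of how such totals are computed, mirroring the per-tensor loops the old accessors below used (accumulate_stats is an illustrative helper, not part of this patch):

#include "ggml.h"
#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Illustrative only: sum element counts and byte sizes over a tensor list,
// as the removed llama_model_size()/llama_model_n_params() bodies did.
static void accumulate_stats(const std::vector<std::pair<std::string, struct ggml_tensor *>> & tensors,
                             uint64_t & n_elements, size_t & n_bytes) {
    n_elements = 0;
    n_bytes    = 0;
    for (const auto & it : tensors) {
        n_elements += ggml_nelements(it.second); // parameter count per tensor
        n_bytes    += ggml_nbytes(it.second);    // allocated bytes per tensor
    }
}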
@@ -9256,6 +9267,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
     }

+    llm_load_stats(ml, model);
     llm_load_print_meta(ml, model);

     if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
@@ -18601,6 +18613,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llama_model model;
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
+    llm_load_stats(ml, model);

     struct quantize_state_internal qs(model, params);
@@ -19952,20 +19965,12 @@ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t bu
         llama_model_ftype_name(model->ftype).c_str());
 }

-uint64_t llama_model_size(const struct llama_model * model) {
-    uint64_t size = 0;
-    for (const auto & it : model->tensors_by_name) {
-        size += ggml_nbytes(it.second);
-    }
-    return size;
+size_t llama_model_size(const struct llama_model * model) {
+    return model->n_bytes;
 }

 uint64_t llama_model_n_params(const struct llama_model * model) {
-    uint64_t nparams = 0;
-    for (const auto & it : model->tensors_by_name) {
-        nparams += ggml_nelements(it.second);
-    }
-    return nparams;
+    return model->n_elements;
 }

 struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
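With the accessors reduced to cached reads, a hypothetical internal sanity check (not part of the patch; assumes tensors_by_name is populated, which per the struct comment happens for quantize-stats) would confirm the cache matches a fresh pass over the tensors:

// Hypothetical check, file-local to src/llama.cpp where llama_model is visible.
static void llama_check_model_stats(const llama_model & model) {
    uint64_t n_elements = 0;
    size_t   n_bytes    = 0;
    for (const auto & it : model.tensors_by_name) {
        n_elements += ggml_nelements(it.second);
        n_bytes    += ggml_nbytes(it.second);
    }
    GGML_ASSERT(n_elements == model.n_elements);
    GGML_ASSERT(n_bytes    == model.n_bytes);
}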