src/llama.cpp: 123 changes (65 additions, 58 deletions)
@@ -4271,20 +4271,34 @@ struct llama_model_loader {
 
         ggml_tensor * tensor;
 
-        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
-            const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+            const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
             if (tensor_idx < 0) {
-                throw std::runtime_error(format("tensor '%s' not found in the model", name));
+                throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
             }
 
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
             if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
-                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
             }
         }
     };
-    std::vector<llama_tensor_weight> weights;
 
+    // custom comparator to sort weights more nicely by layer
+    struct weight_name_comparer {
+        bool operator()(const std::string & a, const std::string & b) const {
+            int a_layer = -1;
+            int b_layer = -1;
+            sscanf(a.c_str(), "blk.%d.", &a_layer);
+            sscanf(b.c_str(), "blk.%d.", &b_layer);
+            if (a_layer != b_layer) {
+                return a_layer < b_layer;
+            }
+            return a < b;
+        }
+    };
+
+    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
    struct gguf_context * meta = NULL;
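
Note: weight_name_comparer keys the map on the layer index parsed from names of the form "blk.<N>.", so iteration walks the weights layer by layer in numeric order; a plain std::map would sort lexicographically and put "blk.10." before "blk.2.". Names without a "blk.<N>." prefix (e.g. "token_embd.weight") keep layer -1 and sort ahead of all layers. A standalone sketch of that ordering (illustrative only, not part of the patch):

#include <cstdio>
#include <map>
#include <string>

struct weight_name_comparer {
    bool operator()(const std::string & a, const std::string & b) const {
        int a_layer = -1;
        int b_layer = -1;
        // names that do not start with "blk.<N>." keep layer -1 and sort first
        sscanf(a.c_str(), "blk.%d.", &a_layer);
        sscanf(b.c_str(), "blk.%d.", &b_layer);
        if (a_layer != b_layer) {
            return a_layer < b_layer;
        }
        return a < b; // same layer: fall back to lexicographic order
    }
};

int main() {
    std::map<std::string, int, weight_name_comparer> m;
    m["blk.10.attn_q.weight"] = 0;
    m["blk.2.attn_q.weight"]  = 0;
    m["output.weight"]        = 0;
    m["token_embd.weight"]    = 0;
    for (const auto & it : m) {
        printf("%s\n", it.first.c_str());
    }
    // prints: output.weight, token_embd.weight, blk.2..., then blk.10...
}
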
@@ -4326,7 +4340,14 @@ struct llama_model_loader {
        // For subsidiary files, `meta` tensor data offset must not be used,
        // so we build a unified tensors index for weights.
        for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
+            std::string tensor_name = std::string(cur->name);
+            // make sure there is no duplicated tensor names
+            if (weights_map.find(tensor_name) != weights_map.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+            }
+            n_elements += ggml_nelements(cur);
+            n_bytes    += ggml_nbytes(cur);
+            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta, cur));
        }
        uint16_t n_split = 0;
        get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
@@ -4366,7 +4387,14 @@ struct llama_model_loader {
 
            // Save tensors data offset info of the shard.
            for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
+                std::string tensor_name = std::string(cur->name);
+                // make sure there is no duplicated tensor names
+                if (weights_map.find(tensor_name) != weights_map.end()) {
+                    throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+                }
+                n_elements += ggml_nelements(cur);
+                n_bytes    += ggml_nbytes(cur);
+                weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf, cur));
            }
 
            gguf_free(ctx_gguf);
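
Note: both insertion sites use the same find-then-emplace guard, and the n_elements/n_bytes totals are now accumulated at insertion time rather than in a separate pass (the removed loop appears further down). As an aside, the duplicate check can also be done with a single lookup, since std::map::emplace reports whether the key was newly inserted; a minimal sketch with a hypothetical helper and payload type, not the patch's code:

#include <map>
#include <stdexcept>
#include <string>

// add_weight is a hypothetical stand-in for the loader's insertion step
void add_weight(std::map<std::string, int> & weights, const std::string & name, int payload) {
    // emplace returns {iterator, inserted}; inserted == false means the name already exists
    if (!weights.emplace(name, payload).second) {
        throw std::runtime_error("invalid model: tensor '" + name + "' is duplicated");
    }
}
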
@@ -4376,7 +4404,7 @@
 
        // sanity check
        {
-            const int n_tensors_loaded = (int) weights.size();
+            const int n_tensors_loaded = (int) weights_map.size();
            if (n_tensors != n_tensors_loaded) {
                throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
            }
@@ -4386,23 +4414,10 @@
        }
 
        n_kv = gguf_get_n_kv(meta);
-        n_tensors = weights.size();
+        n_tensors = weights_map.size();
 
        fver = (enum llama_fver) gguf_get_version(meta);
 
-        std::set<std::string> tensor_names;
-        for (auto & w : weights) {
-            n_elements += ggml_nelements(w.tensor);
-            n_bytes    += ggml_nbytes(w.tensor);
-            // make sure there is no duplicated tensor names
-            const std::string name(w.tensor->name);
-            auto found = tensor_names.find(name);
-            if (found != tensor_names.end()) {
-                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
-            }
-            tensor_names.insert(name);
-        }
-
        LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
                __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
 
@@ -4414,8 +4429,10 @@
        uint32_t n_type_max = 0;
        enum ggml_type type_max = GGML_TYPE_F32;
 
-        for (int i = 0; i < n_tensors; i++) {
-            const ggml_tensor * tensor = weights.at(i).tensor;
+        for (const auto & it : weights_map) {
+            const llama_tensor_weight & w = it.second;
+            const ggml_tensor * tensor = w.tensor;
+
            enum ggml_type type = tensor->type;
 
            n_type[type]++;
@@ -4426,8 +4443,8 @@
            }
 
            if (trace > 0) {
-                const uint16_t sid = weights.at(i).idx;
-                LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
+                const uint16_t sid = w.idx;
+                LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
            }
        }
 
@@ -4691,21 +4708,13 @@ struct llama_model_loader {
        return llm_kv.arch;
    }
 
-    const char * get_tensor_name(int i) const {
-        return weights.at(i).tensor->name;
-    }
-
    const llama_tensor_weight * get_weight(const char * name) const {
-        for (const auto & weight : weights) {
-            if (strcmp(name, weight.tensor->name) == 0) {
-                return &weight;
-            }
-        }
+        auto pos = weights_map.find(name);
+        if (pos != weights_map.end()) {
+            return &pos->second;
+        }
+
        return nullptr;
    }
 
-    const llama_tensor_weight * get_weight(int i) const {
-        return get_weight(get_tensor_name(i));
-    }
-
    const llama_tensor_weight & require_weight(const char * name) const {
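
Note: name lookup now delegates to std::map::find, an O(log n) tree walk instead of the old O(n) strcmp scan, and the index-based accessors (get_tensor_name, get_weight(int), and get_tensor_meta(int) below) are dropped because callers iterate the map directly. A self-contained sketch of the before/after lookup shape, using mock types rather than the llama.cpp API:

#include <cstring>
#include <map>
#include <string>
#include <vector>

struct weight { const char * name; int idx; };

// before: linear scan, one strcmp per stored weight
const weight * find_linear(const std::vector<weight> & v, const char * name) {
    for (const auto & w : v) {
        if (strcmp(name, w.name) == 0) {
            return &w;
        }
    }
    return nullptr;
}

// after: ordered-map lookup keyed by the tensor name
const weight * find_map(const std::map<std::string, weight> & m, const char * name) {
    auto pos = m.find(name);
    return pos != m.end() ? &pos->second : nullptr;
}
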
@@ -4732,10 +4741,6 @@ struct llama_model_loader {
        return tensor;
    }
 
-    struct ggml_tensor * get_tensor_meta(int i) const {
-        return get_tensor_meta(get_tensor_name(i));
-    }
-
    const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
        const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
 
@@ -4842,8 +4847,8 @@ struct llama_model_loader {
        }
 
        // compute the total size of all tensors for progress reporting
-        for (auto & w : weights) {
-            size_data += ggml_nbytes(w.tensor);
+        for (const auto & it : weights_map) {
+            size_data += ggml_nbytes(it.second.tensor);
        }
    }
 
@@ -18598,10 +18603,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        }
    }
 
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
+    for (const auto & it : ml.weights_map) {
+        const struct ggml_tensor * tensor = it.second.tensor;
 
-        const std::string name = ggml_get_name(meta);
+        const std::string name = ggml_get_name(tensor);
 
        // TODO: avoid hardcoded tensor names - use the TN_* constants
        if (name.find("attn_v.weight") != std::string::npos ||
@@ -18639,20 +18644,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    std::vector<no_init<float>> f32_conv_buf;
 
    uint16_t n_split = 1;
+    const auto & weights_map = ml.weights_map;
+
    // Assume split index is continuous
    if (params->keep_split) {
-        for (int i = 0; i < ml.n_tensors; ++i) {
-            n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+        for (const auto & it : weights_map) {
+            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
        }
+
    }
    std::vector<gguf_context*> ctx_outs(n_split, NULL);
    ctx_outs[0] = ctx_out;
 
    // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        uint16_t i_split = params->keep_split ? weight->idx : 0;
-        struct ggml_tensor * tensor = weight->tensor;
+    for (const auto & it : weights_map) {
+        uint16_t i_split = params->keep_split ? it.second.idx : 0;
+        struct ggml_tensor * tensor = it.second.tensor;
        if (ctx_outs[i_split] == NULL) {
            ctx_outs[i_split] = gguf_init_empty();
        }
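
Note: because split indices are assumed continuous, the shard count is simply the largest idx plus one. A tiny self-contained sketch of that computation (made-up shard indices, not the patch's code):

#include <algorithm>
#include <cstdint>
#include <vector>

int main() {
    std::vector<uint16_t> shard_idx = {0, 0, 1, 2, 2}; // split index of each weight
    uint16_t n_split = 1;
    for (uint16_t idx : shard_idx) {
        n_split = std::max<uint16_t>(idx + 1, n_split);
    }
    // n_split == 3: one gguf_context per output shard
    return 0;
}
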
@@ -18699,12 +18706,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
    const auto tn = LLM_TN(model.arch);
    new_ofstream(0);
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        struct ggml_tensor * tensor = weight->tensor;
-        if (weight->idx != cur_split && params->keep_split) {
+    for (const auto & it : weights_map) {
+        const auto & weight = it.second;
+        struct ggml_tensor * tensor = weight.tensor;
+        if (weight.idx != cur_split && params->keep_split) {
            close_ofstream();
-            new_ofstream(weight->idx);
+            new_ofstream(weight.idx);
        }
 
        const std::string name = ggml_get_name(tensor);