
Commit 444f00b

llama : remove quantization sanity check (ggml-org#17788)
* llama : remove quantization sanity check

This commit removes the quantization sanity check for attention layers. The motivation is that there are hybrid models that mix recurrent layers, expert layers, and attention layers, and for these models the current check fails because the expert layers are not taken into account. After consideration, it was decided that this check is not strictly necessary and can be removed to allow for more flexible model architectures.

* llama : remove unused pruned_attention_w and is_clip_model vars
1 parent 2960eb2 commit 444f00b
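
For context, the check being removed asserted that every non-pruned, non-recurrent layer contributes attention weights: qs.n_attention_wv == n_layer_all - pruned_attention_w - n_layer_recr. Below is a minimal standalone sketch of why that arithmetic can break for a hybrid layout, as described in the commit message; the block layout, the block_type enum, and the counts are hypothetical and are not llama.cpp code.

// Toy model of the removed counting logic (hypothetical, not llama.cpp code).
#include <cstdio>
#include <vector>

enum class block_type { attention, recurrent, expert_only };

int main() {
    // Hypothetical hybrid layout: 4 attention blocks, 3 recurrent blocks,
    // and 1 expert-only block that has neither attention nor recurrent state.
    const std::vector<block_type> blocks = {
        block_type::attention, block_type::recurrent, block_type::attention,
        block_type::recurrent, block_type::expert_only,
        block_type::attention, block_type::recurrent, block_type::attention,
    };

    int n_attention_wv = 0; // blocks that would provide attn_v.weight (or similar)
    int n_layer_recr   = 0; // blocks flagged as recurrent/linear-attention
    for (block_type b : blocks) {
        if (b == block_type::attention) n_attention_wv++;
        if (b == block_type::recurrent) n_layer_recr++;
    }

    const int n_layer_all        = (int) blocks.size();
    const int pruned_attention_w = 0; // nothing is pruned in this toy example

    // The removed GGML_ASSERT expected these two numbers to match.
    // Here: 4 vs 8 - 0 - 3 = 5, because the expert-only block appears in
    // n_layer_all but is not counted by either subtraction term.
    printf("n_attention_wv = %d, expected = %d\n",
           n_attention_wv, n_layer_all - pruned_attention_w - n_layer_recr);
    return 0;
}

In this sketch the block that carries only experts has no attention tensors and is not flagged as recurrent, so the left-hand side of the old assert undercounts and the check would fire during quantization even though the model is valid.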

1 file changed: +0 additions, -27 deletions


src/llama-quant.cpp

Lines changed: 0 additions & 27 deletions
@@ -666,19 +666,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     std::map<int, std::string> mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;
 
     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
     tensors.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }
@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -717,30 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        int32_t n_layer_all = model.hparams.n_layer;
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_all is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_all += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_all = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_all, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_all - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 