5 changes: 5 additions & 0 deletions src/llama-arch.cpp
@@ -5,6 +5,7 @@
 #include <map>
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP,   "clip"   }, // dummy, only used by llama-quantize
     { LLM_ARCH_LLAMA,  "llama"  },
     { LLM_ARCH_LLAMA4, "llama4" },
     { LLM_ARCH_DECI,   "deci"   },
@@ -275,6 +276,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
     {
         LLM_ARCH_LLAMA,
         {
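
Note (illustrative, not part of the diff): the empty tensor-name map for LLM_ARCH_CLIP works because a registered architecture only needs an entry to exist; lookups then simply yield no named tensors, which is enough for a pass that only touches raw tensor data. A minimal standalone sketch of that registration pattern, using made-up enum and tensor names:

#include <iostream>
#include <map>
#include <string>

// hypothetical stand-ins for llm_arch / llm_tensor, for illustration only
enum arch      { ARCH_CLIP, ARCH_LLAMA };
enum tensor_id { TENSOR_TOKEN_EMBD };

static const std::map<arch, std::map<tensor_id, const char *>> TENSOR_NAMES = {
    { ARCH_CLIP,  {} },                                      // dummy entry: no named tensors
    { ARCH_LLAMA, { { TENSOR_TOKEN_EMBD, "token_embd" } } },
};

int main() {
    // the dummy arch resolves like any other entry, it just maps to nothing
    std::cout << "clip entries:  " << TENSOR_NAMES.at(ARCH_CLIP).size()  << "\n";
    std::cout << "llama entries: " << TENSOR_NAMES.at(ARCH_LLAMA).size() << "\n";
    return 0;
}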
1 change: 1 addition & 0 deletions src/llama-arch.h
@@ -9,6 +9,7 @@
 //
 
 enum llm_arch {
+    LLM_ARCH_CLIP,
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,
4 changes: 3 additions & 1 deletion src/llama-model.cpp
@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
     // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
         return;
     }
 
@@ -20013,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
 llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
+        case LLM_ARCH_CLIP:
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_MPT:
8 changes: 7 additions & 1 deletion src/llama-quant.cpp
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.find("mm.") != std::string::npos;
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
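
Note (illustrative, not part of the diff): the quantizer decides it is looking at an mmproj/CLIP file purely from tensor names, by checking for the "mm." projector prefix, and then relaxes the attention sanity check and keeps position embeddings unquantized. A minimal standalone sketch of that name-based detection, with made-up tensor names:

#include <iostream>
#include <string>
#include <vector>

int main() {
    // hypothetical tensor names; an mmproj file carries "mm." projector tensors
    std::vector<std::string> names = {
        "v.blk.0.attn_q.weight",
        "mm.2.weight",             // multimodal projector tensor
        "v.position_embd.weight",  // excluded from quantization by the rule above
    };

    bool is_clip_model = false;
    for (const auto & name : names) {
        // same test as the added llama-quant.cpp line: any "mm." substring marks the file
        is_clip_model |= name.find("mm.") != std::string::npos;
    }

    std::cout << (is_clip_model ? "mmproj/CLIP file" : "regular model") << "\n";
    return 0;
}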
3 changes: 3 additions & 0 deletions src/llama.cpp
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     } catch(const std::exception & e) {
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
+    if (model.arch == LLM_ARCH_CLIP) {
+        throw std::runtime_error("mmproj cannot be used as main model, use it with --mmproj instead");
+    }
     try {
         model.load_vocab(ml);
     } catch(const std::exception & e) {
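
Note (illustrative, not part of the diff): with the guard above, passing an mmproj file as the main model fails early with a clear message instead of breaking later during setup. A tiny standalone sketch of the same fail-fast pattern, using hypothetical names:

#include <iostream>
#include <stdexcept>

// hypothetical stand-in for llm_arch, for illustration only
enum arch { ARCH_CLIP, ARCH_LLAMA };

static void load_main_model(arch a) {
    // fail fast: an mmproj file only holds the multimodal projector,
    // so it is rejected before any further loading happens
    if (a == ARCH_CLIP) {
        throw std::runtime_error("mmproj cannot be used as main model, use it with --mmproj instead");
    }
    // ... loading would continue here for real architectures
}

int main() {
    try {
        load_main_model(ARCH_CLIP);
    } catch (const std::exception & e) {
        std::cerr << "error: " << e.what() << "\n";
    }
    return 0;
}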