Merged
5 changes: 5 additions & 0 deletions src/llama-arch.cpp
@@ -5,6 +5,7 @@
#include <map>

static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+ { LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize
{ LLM_ARCH_LLAMA, "llama" },
{ LLM_ARCH_LLAMA4, "llama4" },
{ LLM_ARCH_DECI, "deci" },
@@ -275,6 +276,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
};

static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+ {
+ LLM_ARCH_CLIP,
+ {},
+ },
{
LLM_ARCH_LLAMA,
{
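For context, LLM_ARCH_NAMES maps each llm_arch value to the architecture string stored in a GGUF file's general.architecture key, and LLM_TENSOR_NAMES holds the per-architecture tensor naming tables. Registering LLM_ARCH_CLIP with a name and an empty tensor map is enough for llama-quantize to recognize a CLIP/mmproj file without defining any LLM-style tensors for it. A minimal standalone sketch of the string-to-enum reverse lookup such a name table enables (illustrative only, not the actual llama.cpp helper; all names here are hypothetical):

#include <cstring>
#include <map>

enum demo_arch { DEMO_ARCH_CLIP, DEMO_ARCH_LLAMA, DEMO_ARCH_UNKNOWN }; // stand-in for llm_arch

static const std::map<demo_arch, const char *> DEMO_ARCH_NAMES = {
    { DEMO_ARCH_CLIP,  "clip"  },
    { DEMO_ARCH_LLAMA, "llama" },
};

// resolve the general.architecture string read from GGUF metadata back to an enum value
static demo_arch demo_arch_from_string(const char * name) {
    for (const auto & kv : DEMO_ARCH_NAMES) {
        if (std::strcmp(kv.second, name) == 0) {
            return kv.first;
        }
    }
    return DEMO_ARCH_UNKNOWN;
}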
1 change: 1 addition & 0 deletions src/llama-arch.h
@@ -9,6 +9,7 @@
//

enum llm_arch {
+ LLM_ARCH_CLIP,
LLM_ARCH_LLAMA,
LLM_ARCH_LLAMA4,
LLM_ARCH_DECI,
4 changes: 3 additions & 1 deletion src/llama-model.cpp
@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_GENERAL_NAME, name, false);

// everything past this point is not vocab-related
- if (hparams.vocab_only) {
+ // for CLIP models, we only need to load tensors, no hparams
+ if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
return;
}

@@ -20013,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
llama_rope_type llama_model_rope_type(const llama_model * model) {
switch (model->arch) {
// these models do not use RoPE
+ case LLM_ARCH_CLIP:
case LLM_ARCH_GPT2:
case LLM_ARCH_GPTJ:
case LLM_ARCH_MPT:
8 changes: 7 additions & 1 deletion src/llama-quant.cpp
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
});
}

+ bool is_clip_model = false;
for (const auto * it : tensors) {
const struct ggml_tensor * tensor = it->tensor;

@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
qs.has_output = true;
}

+ is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
}

qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

// sanity checks for models that have attention layers
- if (qs.n_attention_wv != 0)
+ if (qs.n_attention_wv != 0 && !is_clip_model)
{
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
// attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;

+ // do not quantize specific multimodal tensors
+ quantize &= name.find(".position_embd.") == std::string::npos;

ggml_type new_type;
void * new_data;
size_t new_size;
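Two details in the hunks above are worth spelling out: a file is treated as a CLIP/mmproj model when any tensor name starts with "mm.", tested with the pre-C++20 starts-with idiom rfind(prefix, 0) == 0, and position-embedding tensors are excluded from quantization by a plain substring test. A small self-contained sketch of both checks (the tensor names are made up for illustration):

#include <cassert>
#include <string>

// pre-C++20 "starts with": search for the prefix anchored at position 0 only
static bool starts_with(const std::string & s, const char * prefix) {
    return s.rfind(prefix, 0) == 0;
}

int main() {
    const std::string names[] = { "mm.projector.weight", "v.position_embd.weight" }; // hypothetical names

    bool is_clip_model = false;
    for (const std::string & name : names) {
        is_clip_model |= starts_with(name, "mm.");                               // projector tensors mark a CLIP/mmproj file
        const bool quantize = name.find(".position_embd.") == std::string::npos; // position embeddings stay unquantized
        assert(quantize == (name != "v.position_embd.weight"));
    }
    assert(is_clip_model);
    return 0;
}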
3 changes: 3 additions & 0 deletions src/llama.cpp
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
} catch(const std::exception & e) {
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
}
+ if (model.arch == LLM_ARCH_CLIP) {
+     throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+ }
try {
model.load_vocab(ml);
} catch(const std::exception & e) {
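With this guard, passing a CLIP/mmproj GGUF as the main model now fails fast with a clear message instead of erroring out later on missing LLM tensors, while llama-quantize can still process the file. Assuming a projector named mmproj-model-f16.gguf (hypothetical filename), quantizing it would look roughly like ./llama-quantize mmproj-model-f16.gguf mmproj-model-Q4_K_M.gguf Q4_K_M; the quantized projector is then meant to be supplied through a --mmproj option alongside a regular text model, not as the main model argument.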