
Commit 7ad0f37

feat(llama-quant): Allow F16 and BF16 quants of ssm_conv1d.weight
This is experimental!

Branch: Mamba2SSD

Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 82bba1d

File tree

1 file changed: +13 -2 lines


src/llama-quant.cpp

Lines changed: 13 additions & 2 deletions
@@ -421,6 +421,18 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             }
             ++qs.i_ffn_up;
         }
+        else if (name.find("ssm_conv1d") != std::string::npos) {
+            // go as low as F16 for now
+            switch (ftype) {
+                case LLAMA_FTYPE_ALL_F32:
+                case LLAMA_FTYPE_MOSTLY_BF16:
+                    break;
+                default:
+                    {
+                        new_type = GGML_TYPE_F16;
+                    }
+            }
+        }
 
     //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
     //}
@@ -859,9 +871,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
 
-        // do not quantize Mamba's small yet 2D weights
+        // do not quantize shortconv 2D weights
         // NOTE: can't use LLM_TN here because the layer number is not known
-        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
         quantize &= name.find("shortconv.conv.weight") == std::string::npos;
 
         // do not quantize RWKV's small yet 2D weights
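Taken together, the two hunks move ssm_conv1d.weight from "never quantize" to "quantize, but no lower than F16": the first hunk selects the type, and the second removes the blanket exclusion so that selection can take effect. Below is a minimal standalone sketch of the resulting type selection; the enums are local stand-ins for the real llama.h/ggml.h definitions, reduced to the cases this commit touches, and the Q4_K_M value is included purely for illustration.

#include <cstdio>

// Local stand-ins for the relevant llama.cpp enum values (illustrative
// subset only; the real definitions live in llama.h and ggml.h).
enum llama_ftype { LLAMA_FTYPE_ALL_F32, LLAMA_FTYPE_MOSTLY_BF16, LLAMA_FTYPE_MOSTLY_Q4_K_M };
enum ggml_type  { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16 };

// Mirrors the new branch in llama_tensor_get_type(): F32 and BF16 target
// ftypes keep whatever type was already chosen, while every other ftype
// forces the ssm_conv1d weight down to F16, never to an integer quant.
static ggml_type ssm_conv1d_type(llama_ftype ftype, ggml_type new_type) {
    switch (ftype) {
        case LLAMA_FTYPE_ALL_F32:
        case LLAMA_FTYPE_MOSTLY_BF16:
            break;
        default:
            new_type = GGML_TYPE_F16;
    }
    return new_type;
}

int main() {
    // A Q4_K_M quantization run now stores ssm_conv1d.weight as F16;
    // before this commit the tensor was excluded from quantization entirely.
    printf("%d\n", ssm_conv1d_type(LLAMA_FTYPE_MOSTLY_Q4_K_M, GGML_TYPE_F32));
    return 0;
}

In practice this means an ALL_F32 or MOSTLY_BF16 run leaves the conv weight at the full-precision type already picked for that ftype, while any integer-quant ftype stores it as F16, matching the F16/BF16 behavior named in the commit title.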
