
Commit 0941092

Set is_recurrent from n_head_kv

1 parent 85c7986 commit 0941092
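In short: the {arch}.is_recurrent_layer GGUF key is removed everywhere. The HF converter now encodes recurrence implicitly by writing a per-layer num_key_value_heads array with 0 for non-attention (shortconv) layers, and llama.cpp reconstructs hparams.recurrent_layer_arr at load time from n_head_kv(il) == 0.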

File tree

7 files changed: +7 -16 lines changed


convert_hf_to_gguf.py

Lines changed: 1 addition & 2 deletions

@@ -6970,13 +6970,12 @@ def _add_feed_forward_length(self):
 
 
     def set_gguf_parameters(self):
-        # set only for attention layers before calling super().set_gguf_parameters()
+        # set num_key_value_heads only for attention layers
         self.hparams["num_key_value_heads"] = [(self.hparams["num_key_value_heads"] if x in self.hparams["full_attn_idxs"] else 0) for x in range(self.block_count)]
 
         super().set_gguf_parameters()
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
         self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
-        self.gguf_writer.add_is_recurrent_layer([x not in self.hparams["full_attn_idxs"] for x in range(self.block_count)])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
         self._add_feed_forward_length()
 
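In effect, the converter now encodes layer recurrence implicitly: attention layers keep their KV head count, shortconv layers get 0, which makes the removed add_is_recurrent_layer call redundant. A minimal standalone sketch of that masking; the hparams values and layer indices below are hypothetical, not from a real LFM2 config:

    # Hypothetical LFM2-style hparams; only layers 2 and 5 use full attention.
    hparams = {"num_key_value_heads": 8, "full_attn_idxs": [2, 5]}
    block_count = 6

    # Same masking as the diff above: attention layers keep their KV head
    # count, shortconv (recurrent) layers are set to 0. The comprehension is
    # evaluated before the assignment, so it still reads the original int.
    hparams["num_key_value_heads"] = [
        hparams["num_key_value_heads"] if x in hparams["full_attn_idxs"] else 0
        for x in range(block_count)
    ]
    print(hparams["num_key_value_heads"])  # [0, 0, 8, 0, 0, 8]

    # A layer is recurrent exactly when its KV head count is 0.
    print([n == 0 for n in hparams["num_key_value_heads"]])
    # [True, True, False, True, True, False]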

gguf-py/gguf/constants.py

Lines changed: 1 addition & 1 deletion

@@ -122,7 +122,6 @@ class LLM:
         ALTUP_ACTIVE_IDX          = "{arch}.altup.active_idx"
         ALTUP_NUM_INPUTS          = "{arch}.altup.num_inputs"
         EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
-        IS_RECURRENT_LAYER        = "{arch}.is_recurrent_layer"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"

@@ -2335,6 +2334,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.LFM2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,

gguf-py/gguf/gguf_writer.py

Lines changed: 0 additions & 3 deletions

@@ -651,9 +651,6 @@ def add_convnext_block_count(self, length: int) -> None:
     def add_shortconv_l_cache(self, length: int) -> None:
         self.add_uint32(Keys.ShortConv.L_CACHE.format(arch=self.arch), length)
 
-    def add_is_recurrent_layer(self, value: Sequence[bool]) -> None:
-        self.add_array(Keys.LLM.IS_RECURRENT_LAYER.format(arch=self.arch), value)
-
     def add_block_count(self, length: int) -> None:
         self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
 
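With this helper gone, the per-layer information reaches the GGUF file only through the standard KV head-count key. A hedged writer-side sketch, assuming gguf-py's GGUFWriter.add_head_count_kv accepts a per-layer sequence (it dispatches to add_array for non-int values in current gguf-py); the path, arch, and values are illustrative:

    from gguf import GGUFWriter

    # Illustrative only: queue metadata for a 6-layer model where layers 2
    # and 5 are attention layers and the rest are recurrent (KV heads = 0).
    writer = GGUFWriter("model.gguf", arch="lfm2")
    writer.add_block_count(6)
    writer.add_head_count_kv([0, 0, 8, 0, 0, 8])
    # ... remaining metadata and tensors would be added before writing out.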

src/llama-arch.cpp

Lines changed: 0 additions & 2 deletions

@@ -191,8 +191,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
     { LLM_KV_SHORTCONV_L_CACHE,  "%s.shortconv.l_cache" },
 
-    { LLM_KV_IS_RECURRENT_LAYER, "%s.is_recurrent_layer" },
-
     { LLM_KV_TOKENIZER_MODEL,    "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE,      "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST,     "tokenizer.ggml.tokens" },

src/llama-arch.h

Lines changed: 0 additions & 2 deletions

@@ -230,8 +230,6 @@ enum llm_kv {
 
     LLM_KV_SHORTCONV_L_CACHE,
 
-    LLM_KV_IS_RECURRENT_LAYER,
-
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,

src/llama-model-loader.cpp

Lines changed: 2 additions & 5 deletions

@@ -305,11 +305,10 @@ namespace GGUFMeta {
         case GGUF_TYPE_UINT32:
         case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T, int32_t>::value) ||
                                             (std::is_same<T, uint32_t>::value)); break;
-        case GGUF_TYPE_BOOL:    GGML_ASSERT((std::is_same<T, bool>::value)); break;
         case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
         case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
         default:
-            throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str()));
+            throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
     }
 
     if constexpr (std::is_same<T, std::string>::value) {

@@ -347,11 +346,10 @@ namespace GGUFMeta {
         case GGUF_TYPE_UINT32:
         case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T, int32_t>::value) ||
                                             (std::is_same<T, uint32_t>::value)); break;
-        case GGUF_TYPE_BOOL:    GGML_ASSERT((std::is_same<T, bool>::value)); break;
         case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
         case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
         default:
-            throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str()));
+            throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
     }
 
     if (arr_info.length > N_MAX) {

@@ -466,7 +464,6 @@
 // TODO: this is not very clever - figure out something better
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
-template bool llama_model_loader::get_key_or_arr<std::array<bool, 512>>(enum llm_kv kid, std::array<bool, 512> & result, uint32_t n, bool required);
 
 llama_model_loader::llama_model_loader(
         const std::string & fname,
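These GGUF_TYPE_BOOL deletions follow from the key removal: with is_recurrent_layer gone, the loader no longer reads any bool arrays, so the array type gate shrinks back to string/float32/uint32/int32. Transliterated to Python for illustration (names are made up; the real check is the GGML_ASSERT switch above):

    # Sketch of the loader's array type gate after this commit.
    SUPPORTED_ARRAY_TYPES = {"string", "float32", "uint32", "int32"}

    def check_array_type(key: str, gguf_type: str) -> None:
        if gguf_type not in SUPPORTED_ARRAY_TYPES:
            raise RuntimeError(f"{key} is not a string/float32/uint32/int32 array")

    check_array_type("lfm2.attention.head_count_kv", "uint32")  # passes
    # check_array_type("lfm2.is_recurrent_layer", "bool") would now raise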

src/llama-model.cpp

Lines changed: 3 additions & 1 deletion

@@ -498,7 +498,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.n_head_kv_arr = hparams.n_head_arr;
 
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
-    ml.get_key_or_arr(LLM_KV_IS_RECURRENT_LAYER, hparams.recurrent_layer_arr, hparams.n_layer, false);
 
     bool rope_finetuned = false;
     ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);

@@ -1630,6 +1629,9 @@
             {
                 ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
+                }
                 switch (hparams.n_embd) {
                     case 1024: type = LLM_TYPE_350M; break;
                     case 1536: type = LLM_TYPE_700M; break;
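The loader-side derivation is the mirror image of the converter masking: once n_head_kv_arr is populated (per layer, when the GGUF stores an array), a layer is flagged recurrent exactly when its KV head count is zero. A Python transliteration of the added loop, with a hypothetical head-count array:

    # Hypothetical per-layer KV head counts, as read from the GGUF file.
    n_head_kv_arr = [0, 0, 8, 0, 0, 8]

    # Mirror of hparams.n_head_kv(il): return the layer's entry.
    def n_head_kv(il: int) -> int:
        return n_head_kv_arr[il]

    # Mirror of the added C++ loop: recurrent iff the layer has no KV heads.
    recurrent_layer_arr = [n_head_kv(il) == 0 for il in range(len(n_head_kv_arr))]
    assert recurrent_layer_arr == [True, True, False, True, True, False]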
