From cb5d91b54d585a5159a93d20cb32cc1980254474 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 24 May 2025 20:53:42 +0200
Subject: [PATCH 1/7] add missing mask token

also correct obvious vocab padding error (most likely no actual change for
any model out there, but at least makes sense now)
---
 convert_hf_to_gguf.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 123083b915412..e1c079766d384 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3665,7 +3665,7 @@ def _xlmroberta_set_vocab(self) -> None:
         tokenizer = SentencePieceProcessor()
         tokenizer.LoadFromFile(str(tokenizer_path))
 
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+        vocab_size = max(self.hparams.get('vocab_size', 0), tokenizer.vocab_size())
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
@@ -3690,14 +3690,6 @@ def _xlmroberta_set_vocab(self) -> None:
             scores[token_id] = score
             toktypes[token_id] = toktype
 
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
-
         # realign tokens (see HF tokenizer code)
         tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
         scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
@@ -3708,6 +3700,12 @@ def _xlmroberta_set_vocab(self) -> None:
             SentencePieceTokenTypes.UNKNOWN,
         ] + toktypes[3:-1]
 
+        if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+            # Add mask token missing from sentencepiece.bpe.model
+            tokens[250001] = "<mask>"
+            scores[250001] = 0.0
+            toktypes[250001] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)

From c55c1baf358199de33124e7553cd77bf17c1cf81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 24 May 2025 20:55:11 +0200
Subject: [PATCH 2/7] set mask token lstrip attribute

---
 src/llama-vocab.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d5a036a8c4413..b51976699ca7b 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -2080,9 +2080,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
         std::string model_name;
         std::string tokenizer_pre;
+        std::string general_arch;
 
         ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
         ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+        ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
 
         // model name to lowercase
         std::transform(model_name.begin(), model_name.end(), model_name.begin(),
@@ -2091,8 +2093,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             }
         );
 
-        // set attributes by model/tokenizer name
-        if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
+        // set attributes by model/tokenizer/architecture name
+        if (false
+                || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
+                || _contains_any(general_arch, {"nomic-bert-moe"})
+            ) {
             _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
         } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
             for (auto id : cache_special_tokens) {

From 57c8470dc30f7e11b68ca2e6e6e5731719b74789 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 24 May 2025 21:00:33 +0200
Subject: [PATCH 3/7] str -> bytes

---
 convert_hf_to_gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index e1c079766d384..31505b9984398 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3702,7 +3702,7 @@ def _xlmroberta_set_vocab(self) -> None:
 
         if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
             # Add mask token missing from sentencepiece.bpe.model
-            tokens[250001] = "<mask>"
+            tokens[250001] = b'<mask>'
             scores[250001] = 0.0
             toktypes[250001] = SentencePieceTokenTypes.CONTROL
 

From 958eea664bdb751a918349cbbcabb1ff501b075a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Thu, 29 May 2025 22:29:34 +0200
Subject: [PATCH 4/7] remove dead padding code that got readded in merge

---
 convert_hf_to_gguf.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index d7e1fd5442802..e9883e8eb8674 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3870,14 +3870,6 @@ def _xlmroberta_set_vocab(self) -> None:
                 scores[token_id] = score
                 toktypes[token_id] = toktype
 
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
-
         if isinstance(tokenizer, SentencePieceProcessor):
             # realign tokens (see HF tokenizer code)
             tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]

From 7789616e63f7d9b19d8c4dd936ebf63e9b7e5a4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sun, 1 Jun 2025 10:58:08 +0200
Subject: [PATCH 5/7] apply correct padding with AutoTokenizer as well

---
 convert_hf_to_gguf.py | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index e9883e8eb8674..747410e0951c8 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3807,7 +3807,7 @@ def _xlmroberta_set_vocab(self) -> None:
             remove_whitespaces = tokenizer.clean_up_tokenization_spaces
             precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
 
-            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+            vocab_size = max(self.hparams.get('vocab_size', 0), tokenizer.vocab_size)
         else:
             sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
             sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -3851,24 +3851,24 @@ def _xlmroberta_set_vocab(self) -> None:
             unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
 
             for token_id in range(vocab_size):
-                piece = tokenizer._convert_id_to_token(token_id)
-                text = piece.encode("utf-8")
-                score = tokenizer_json["model"]["vocab"][token_id][1]
-
-                toktype = SentencePieceTokenTypes.NORMAL
-                if token_id == unk_token_id:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
-                elif token_id in tokenizer.all_special_ids:
-                    toktype = SentencePieceTokenTypes.CONTROL
-                elif token_id in added_vocab.values():
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
-                # No reliable way to detect this, but jina doesn't have any
-                # elif tokenizer.IsByte(token_id):
-                #     toktype = SentencePieceTokenTypes.BYTE
-
-                tokens[token_id] = text
-                scores[token_id] = score
-                toktypes[token_id] = toktype
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+                    text = piece.encode("utf-8")
+                    score = tokenizer_json["model"]["vocab"][token_id][1]
+
+                    toktype = SentencePieceTokenTypes.NORMAL
+                    if token_id == unk_token_id:
+                        toktype = SentencePieceTokenTypes.UNKNOWN
+                    elif token_id in tokenizer.all_special_ids:
+                        toktype = SentencePieceTokenTypes.CONTROL
+                    elif token_id in added_vocab.values():
+                        toktype = SentencePieceTokenTypes.USER_DEFINED
+                    # No reliable way to detect this, but jina doesn't have any
+                    # elif tokenizer.IsByte(token_id):
+                    #     toktype = SentencePieceTokenTypes.BYTE
+
+                    tokens[token_id] = text
+                    scores[token_id] = score
+                    toktypes[token_id] = toktype
 
         if isinstance(tokenizer, SentencePieceProcessor):
             # realign tokens (see HF tokenizer code)

From a835a0c0b1665a347e22ce765ef9783a46f2e046 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sun, 1 Jun 2025 11:00:05 +0200
Subject: [PATCH 6/7] spaces--

---
 convert_hf_to_gguf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 747410e0951c8..a46c20d14e368 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3854,7 +3854,7 @@ def _xlmroberta_set_vocab(self) -> None:
                 if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
                     text = piece.encode("utf-8")
                     score = tokenizer_json["model"]["vocab"][token_id][1]
-
+
                     toktype = SentencePieceTokenTypes.NORMAL
                     if token_id == unk_token_id:
                         toktype = SentencePieceTokenTypes.UNKNOWN
@@ -3865,7 +3865,7 @@ def _xlmroberta_set_vocab(self) -> None:
                     # No reliable way to detect this, but jina doesn't have any
                     # elif tokenizer.IsByte(token_id):
                     #     toktype = SentencePieceTokenTypes.BYTE
-
+
                     tokens[token_id] = text
                     scores[token_id] = score
                     toktypes[token_id] = toktype

From 4f2f228ecf7789eabc59d7b5667bc3d0dbb20709 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sun, 1 Jun 2025 17:28:57 +0200
Subject: [PATCH 7/7] only add mask token when using sentencepiece

---
 convert_hf_to_gguf.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 4e20bad5c0a6d..ec3b5697d8f6f 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3889,11 +3889,11 @@ def _xlmroberta_set_vocab(self) -> None:
                 SentencePieceTokenTypes.UNKNOWN,
             ] + toktypes[3:-1]
 
-        if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
-            # Add mask token missing from sentencepiece.bpe.model
-            tokens[250001] = b'<mask>'
-            scores[250001] = 0.0
-            toktypes[250001] = SentencePieceTokenTypes.CONTROL
+            if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+                # Add mask token missing from sentencepiece.bpe.model
+                tokens[250001] = b'<mask>'
+                scores[250001] = 0.0
+                toktypes[250001] = SentencePieceTokenTypes.CONTROL
 
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
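
For reference, here is a minimal standalone sketch (not taken from the patches; build_vocab, TokType and the toy vocab sizes are made up for illustration) of the padding scheme the series converges on: pre-fill the token table with [PADi] placeholders sized to the larger of the model's vocab_size and the tokenizer's piece count, overwrite the slots that have real pieces, realign the first four slots the way the HF XLM-RoBERTa tokenizer does, and finally write <mask> into its reserved padding slot (id 250001 in the real nomic-bert-moe vocab; the last slot in this toy example).

from enum import IntEnum


class TokType(IntEnum):
    # stand-in for gguf SentencePieceTokenTypes
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5


def build_vocab(hparams_vocab_size: int, sp_pieces: list[str], is_nomic_moe: bool):
    # pad to whichever is larger: the model's embedding rows or the tokenizer's pieces
    vocab_size = max(hparams_vocab_size, len(sp_pieces))

    tokens = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
    scores = [-10000.0] * vocab_size
    toktypes = [TokType.UNUSED] * vocab_size

    # overwrite placeholder slots that have real sentencepiece pieces
    for i, piece in enumerate(sp_pieces):
        tokens[i] = piece.encode("utf-8")
        scores[i] = 0.0
        toktypes[i] = TokType.NORMAL

    # realign the leading slots the way the HF XLM-RoBERTa tokenizer does
    tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
    scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
    toktypes = [TokType.CONTROL] * 3 + [TokType.UNKNOWN] + toktypes[3:-1]

    if is_nomic_moe:
        # the mask token is absent from sentencepiece.bpe.model, so place it in
        # the padding slot reserved for it (250001 in the real model, the last
        # slot of this toy vocab)
        mask_id = vocab_size - 1
        tokens[mask_id] = b'<mask>'
        scores[mask_id] = 0.0
        toktypes[mask_id] = TokType.CONTROL

    return tokens, scores, toktypes


print(build_vocab(8, ["<unk>", "<s>", "</s>", "hello", "world", "!"], True)[0])
# [b'<s>', b'<pad>', b'</s>', b'<unk>', b'hello', b'world', b'!', b'<mask>']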