From 41b098f12f96cd20f0962aabf71cc864d6c356d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 21 Jun 2025 08:15:16 +0200 Subject: [PATCH 1/3] fix Qwen3-Embedding eos token --- gguf-py/gguf/vocab.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index a792d56f0677d..34aa7bac08b9f 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -197,6 +197,16 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'): if not tokenizer_config: special_eos = special_last + elif special_last != special_eos: + if 'eot' not in self.special_token_types: + self.special_token_types = self.special_token_types + ('eot', ) + tokenizer_config['eot_token'] = special_eos + elif 'eom' not in self.special_token_types: + self.special_token_types = self.special_token_types + ('eom', ) + tokenizer_config['eom_token'] = special_eos + else: + logger.warning(f'Overriding special token {special_eos!r} with {special_last!r} without EOT/EOM fallback!') + tokenizer_config['eos_token'] = special_eos = special_last self.add_special_token['eos'] = True if special_last == special_eos else False if special_last != special_eos: logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing') From 2336167e60d21f4ff3ecdaa620dbddfc78a154ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 21 Jun 2025 08:22:01 +0200 Subject: [PATCH 2/3] typings fix --- gguf-py/gguf/vocab.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 34aa7bac08b9f..d2394e5f3f535 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -199,10 +199,10 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: special_eos = special_last elif special_last != special_eos: if 'eot' not in self.special_token_types: - self.special_token_types = self.special_token_types + ('eot', ) + self.special_token_types = tuple(self.special_token_types) + ('eot', ) tokenizer_config['eot_token'] = special_eos elif 'eom' not in self.special_token_types: - self.special_token_types = self.special_token_types + ('eom', ) + self.special_token_types = tuple(self.special_token_types) + ('eom', ) tokenizer_config['eom_token'] = special_eos else: logger.warning(f'Overriding special token {special_eos!r} with {special_last!r} without EOT/EOM fallback!') From 2fe5eb37bbe634657573db235ddc510670b2679e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 21 Jun 2025 09:13:44 +0200 Subject: [PATCH 3/3] nit [no ci] --- gguf-py/gguf/vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index d2394e5f3f535..3b08f6134a67a 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -205,7 +205,7 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: self.special_token_types = tuple(self.special_token_types) + ('eom', ) tokenizer_config['eom_token'] = special_eos else: - logger.warning(f'Overriding special token {special_eos!r} with {special_last!r} without EOT/EOM fallback!') + logger.warning(f'Overriding EOS token {special_eos!r} with {special_last!r} without EOT/EOM fallback!') tokenizer_config['eos_token'] = special_eos = special_last self.add_special_token['eos'] = True if special_last == special_eos else False if special_last != special_eos: