Skip to content

Commit f172a27

Browse files
authored
add warnings
ggml-ci
1 parent 3350e4a commit f172a27

File tree

1 file changed

+20
-3
lines changed

1 file changed

+20
-3
lines changed

gguf-py/gguf/vocab.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -185,25 +185,42 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
185185
if not tokenizer_config:
186186
special_bos = special_first
187187
self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False
188+
if special_first not in (special_bos, special_cls):
189+
logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing<single>')
188190
if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'):
189191
if not tokenizer_config:
190192
special_eos = special_last
191193
self.add_special_token['eos'] = True if special_last == special_eos else False
194+
if special_last != special_eos:
195+
logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing<single>')
192196
if tmpl_pair:
193197
seq_start = 1 if tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0
194198
seq_stop = -1 if tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None
199+
if seq_start == 0 or seq_stop == None:
200+
logger.warning(f'TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>')
195201
if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]:
196202
tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id')
197203
tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id')
204+
if tmpl_a != 'A' or tmpl_b != 'B':
205+
logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing<pair>')
198206
# A [sep] [eos] B
199207
if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]):
200208
add_sep = False
201209
if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'):
202210
if special_entry in (special_sep, special_eos) and not special_last:
203211
add_sep = True
204-
if len(tmpl_pair) == 2 and (special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id')):
205-
if special_entry in (special_sep, special_eos):
206-
add_sep = True
212+
if special_entry not in (special_sep, special_eos):
213+
logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing<pair>')
214+
else:
215+
logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing<pair>')
216+
if len(tmpl_pair) == 2:
217+
if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'):
218+
if special_entry in (special_sep, special_eos):
219+
add_sep = True
220+
if special_entry not in (special_sep, special_eos):
221+
logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing<pair>')
222+
else:
223+
logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing<pair>')
207224
self.add_special_token['sep'] = add_sep
208225
if add_sep and not special_sep and tokenizer_config:
209226
tokenizer_config['sep_token'] = special_eos

0 commit comments

Comments
 (0)