@@ -185,25 +185,42 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
185185 if not tokenizer_config :
186186 special_bos = special_first
187187 self .add_special_token ['bos' ] = True if special_first in (special_bos , special_cls ) else False
188+ if special_first not in (special_bos , special_cls ):
189+ logger .warning (f'Unknown leading special token { special_first !r} in TemplateProcessing<single>' )
188190 if special_last := tmpl_single [- 1 ].get ('SpecialToken' , {}).get ('id' ):
189191 if not tokenizer_config :
190192 special_eos = special_last
191193 self .add_special_token ['eos' ] = True if special_last == special_eos else False
194+ if special_last != special_eos :
195+ logger .warning (f'Unknown trailing special token { special_first !r} in TemplateProcessing<single>' )
192196 if tmpl_pair :
193197 seq_start = 1 if tmpl_pair [0 ].get ('SpecialToken' , {}).get ('id' ) == special_first else 0
194198 seq_stop = - 1 if tmpl_pair [- 1 ].get ('SpecialToken' , {}).get ('id' ) == special_last else None
199+ if seq_start == 0 or seq_stop == None :
200+ logger .warning (f'TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>' )
195201 if tmpl_pair := tmpl_pair [slice (seq_start , seq_stop )]:
196202 tmpl_a = tmpl_pair [0 ].get ('Sequence' , {}).get ('id' )
197203 tmpl_b = tmpl_pair [- 1 ].get ('Sequence' , {}).get ('id' )
204+ if tmpl_a != 'A' or tmpl_b != 'B' :
205+ logger .warning (f'Unknown sequence { tmpl_a } ...{ tmpl_b } in TemplateProcessing<pair>' )
198206 # A [sep] [eos] B
199207 if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair [1 :- 1 ]):
200208 add_sep = False
201209 if special_entry := tmpl_pair [0 ].get ('SpecialToken' , {}).get ('id' ):
202210 if special_entry in (special_sep , special_eos ) and not special_last :
203211 add_sep = True
204- if len (tmpl_pair ) == 2 and (special_entry := tmpl_pair [1 ].get ('SpecialToken' , {}).get ('id' )):
205- if special_entry in (special_sep , special_eos ):
206- add_sep = True
212+ if special_entry not in (special_sep , special_eos ):
213+ logger .warning (f'Unknown separator token { special_entry !r} in TemplateProcessing<pair>' )
214+ else :
215+ logger .warning (f'Unknown middle sequence { tmpl_pair [0 ]!r} in TemplateProcessing<pair>' )
216+ if len (tmpl_pair ) == 2 :
217+ if special_entry := tmpl_pair [1 ].get ('SpecialToken' , {}).get ('id' ):
218+ if special_entry in (special_sep , special_eos ):
219+ add_sep = True
220+ if special_entry not in (special_sep , special_eos ):
221+ logger .warning (f'Unknown second separator token { special_entry !r} in TemplateProcessing<pair>' )
222+ else :
223+ logger .warning (f'Unknown second middle sequence { tmpl_pair [1 ]!r} in TemplateProcessing<pair>' )
207224 self .add_special_token ['sep' ] = add_sep
208225 if add_sep and not special_sep and tokenizer_config :
209226 tokenizer_config ['sep_token' ] = special_eos
0 commit comments