44"""
55import os
66import sys
7+ import jinja2
78import requests
89import json
10+ from transformers import AutoTokenizer
911
1012
1113# Map an AutoGuess name to a HuggingFace model ID
1517 "ChatML (Qwen 2.5 based)" : "Qwen/Qwen2.5-0.5B-Instruct" ,
1618 "ChatML (Kimi)" : "moonshotai/Kimi-K2-Instruct" ,
1719 "Google Gemma 2" : "Efficient-Large-Model/gemma-2-2b-it" ,
18- "Google Gemma 3" : "scb10x/typhoon2.1-gemma3-12b " ,
20+ "Google Gemma 3" : "google/gemma-3-4b-it " ,
1921 "Google Gemma 3n" : "lmstudio-community/gemma-3n-E4B-it-MLX-bf16" ,
2022 "Llama 3.x" : "Steelskull/L3.3-Shakudo-70b" ,
21- "Llama 4" : "meta-llama /Llama-4-Scout-17B-16E-Instruct" ,
23+ "Llama 4" : "nvidia /Llama-4-Scout-17B-16E-Instruct-FP8 " ,
2224 "Mistral V7 (with system prompt)" : "Doctor-Shotgun/MS3.2-24B-Magnum-Diamond" ,
2325 "Mistral V3" : "mistralai/Mistral-7B-Instruct-v0.3" ,
2426 "GLM-4" : "THUDM/glm-4-9b-chat-hf" ,
3335 "ChatML (Generic)" : "NewEden/Gemma-27B-chatml" ,
3436}
3537
# Adapter checks to skip, keyed by AutoGuess name. The set for a name is passed
# as the `skip` argument of test_tokenizer_with_adapter; names without an entry
# get an empty set, i.e. every check runs.
AUTOGUESS_SKIP_ADAPTER_TESTS = {
    "Mistral V3": {"system"},  # Poor system support
    "Mistral (Generic)": {"system"},  # Poor system support
}
42+
# User may be running this test from ./ or from ../ -- we want to be in ./ (i.e. tests)
# isdir rather than exists: a plain file named "tests" must not reach os.chdir.
if os.path.isdir("tests"):
    os.chdir("tests")
@@ -46,6 +53,11 @@ def get_tokenizer_config_for_huggingface_model_id(huggingface_model_id: str):
4653 with open (fname ) as f :
4754 return json .load (f )
4855
56+ fname = f"gated-tokenizers/tokenizer_configs/{ huggingface_model_id .replace ('/' ,'_' )} /tokenizer_config.json"
57+ if os .path .exists (fname ):
58+ with open (fname ) as f :
59+ return json .load (f )
60+
4961 for filename in ["tokenizer_config.json" , "chat_template.json" ]:
5062 url = f"https://huggingface.co/{ huggingface_model_id } /resolve/main/{ filename } "
5163 response = requests .get (url )
@@ -55,7 +67,13 @@ def get_tokenizer_config_for_huggingface_model_id(huggingface_model_id: str):
5567 return v
5668 raise ValueError (f"Failed to fetch tokenizer config for { huggingface_model_id } ." )
5769
58- def match_chat_template_to_adapter (chat_template : str | list ) -> tuple [str , str | None ]| None :
def get_tokenizer_for_huggingface_model_id(huggingface_model_id: str):
    """
    Load the tokenizer for a HuggingFace model ID.

    If the directory gated-tokenizers/tokenizer_configs/<model id with '/'
    replaced by '_'> exists (relative to the current working directory), the
    tokenizer is loaded from it; otherwise the model ID itself is handed to
    AutoTokenizer.from_pretrained.
    """
    dname = f"gated-tokenizers/tokenizer_configs/{huggingface_model_id.replace('/', '_')}"
    source = dname if os.path.exists(dname) else huggingface_model_id
    # NOTE(review): trust_remote_code=True allows the tokenizer repository to run
    # its own Python code on this machine -- confirm every mapped model ID is trusted.
    return AutoTokenizer.from_pretrained(source, trust_remote_code=True)
75+
76+ def match_chat_template_to_adapter (chat_template : str | list ) -> tuple [dict , str | None ]| None :
5977 # Additional code in tester not present in application: support for multiple chat templates, and use default if present
6078 sub_template : str | None = None
6179 if isinstance (chat_template , list ):
@@ -74,7 +92,48 @@ def match_chat_template_to_adapter(chat_template: str|list) -> tuple[str, str|No
7492 if chat_template != "" :
7593 for entry in autoguess :
7694 if all (s in chat_template for s in entry ['search' ]):
77- return entry ['name' ], sub_template
95+ return entry , sub_template
96+
97+ def test_tokenizer_with_adapter (tokenizer , adapter : dict [str , str ], skip : set ) -> tuple [bool , str | None ]:
98+ """
99+ See if the adapter correctly reflects the tokenizer chat template.
100+ """
101+ def adapter_wrap (role , content ):
102+ return adapter [f"{ role } _start" ] + content + adapter [f"{ role } _end" ]
103+ def system (content ): return adapter_wrap ("system" , content )
104+ def user (content ): return adapter_wrap ("user" , content )
105+ def assistant (content ): return adapter_wrap ("assistant" , content )
106+ def templ (rolelist ):
107+ return tokenizer .apply_chat_template (rolelist , tokenize = False )
108+
109+ try :
110+ # We skip system checks if user and system are identical, or if in skip
111+ if "system" not in skip and user ("x" ) != system ("x" ):
112+ # Test system
113+ expect = system ("SyS-tEm" )
114+ templated = templ ([{"role" : "system" , "content" : "SyS-tEm" }, {"role" : "user" , "content" : "user" }])
115+ if expect not in templated :
116+ return False , f"system role missing expected fragment { expect .replace ("\n " , "\\ n" )} : { templated .replace ("\n " , "\\ n" )} "
117+
118+ # Test user/asst/usernvidia/Llama-4-Scout-17B-16E-Instruct-FP8
119+ expect = [
120+ user ("user_1" ),
121+ assistant ("asst_1" ),
122+ user ("user_2" )
123+ ]
124+ templated = templ ([
125+ {"role" :"user" , "content" : "user_1" },
126+ {"role" :"assistant" , "content" : "asst_1" },
127+ {"role" :"user" , "content" : "user_2" },
128+ ])
129+ rem = templated
130+ for sub in expect :
131+ if sub not in rem :
132+ return False , f"missing expected fragment { sub .replace ("\n " , "\\ n" )} : { rem .replace ("\n " , "\\ n" )} "
133+ rem = rem .split (sub , 1 )[1 ]
134+ except jinja2 .exceptions .TemplateError as e :
135+ return False , f"template error: { e } "
136+ return True , None
78137
79138failures = 0
80139seen = set ()
@@ -87,14 +146,21 @@ def match_chat_template_to_adapter(chat_template: str|list) -> tuple[str, str|No
87146 continue
88147 tokenizer_config = get_tokenizer_config_for_huggingface_model_id (huggingface_model_id )
89148 assert 'chat_template' in tokenizer_config
90- matched = match_chat_template_to_adapter (tokenizer_config ['chat_template' ])
91- if matched is None :
92- matched , sub_template = "MISSING MAPPING" , None
149+ match = match_chat_template_to_adapter (tokenizer_config ['chat_template' ])
150+ if match is None :
151+ matched , sub_template , adapter = "MISSING" , None , None
93152 else :
94- matched , sub_template = matched
153+ match , sub_template = match
154+ matched = match ['name' ]
155+ adapter = match ['adapter' ]
95156 sub_template = f"[{ sub_template } ]" if sub_template else ""
96- print (namefmt .format (name = name ) + " = " + namefmt .format (name = matched ) + " : " + ("OK " if name == matched else "FAILURE" ) + " " + hmifmt .format (huggingface_model_id = huggingface_model_id ) + " " + sub_template )
97- failures += name != matched
157+ adaptercheck , reason = False , '?'
158+ if name == matched :
159+ assert adapter
160+ tokenizer = get_tokenizer_for_huggingface_model_id (huggingface_model_id )
161+ adaptercheck , reason = test_tokenizer_with_adapter (tokenizer , adapter , AUTOGUESS_SKIP_ADAPTER_TESTS .get (name , set ()))
162+ print (namefmt .format (name = name ) + " = " + namefmt .format (name = matched ) + " : " + ("OK " if adaptercheck and name == matched else reason if not adaptercheck else "FAILURE" ) + " " + hmifmt .format (huggingface_model_id = huggingface_model_id ) + " " + sub_template )
163+ failures += name != matched or not adaptercheck
98164
99165for entry in autoguess :
100166 if entry ['name' ] not in seen :
0 commit comments