@@ -12,6 +12,7 @@
     export LLAMA_SERVER_BIN_PATH=$PWD/build/bin/llama-server
     export LLAMA_CACHE=${LLAMA_CACHE:-$HOME/Library/Caches/llama.cpp}

+    ./scripts/tool_bench.py run --n 10 --temp -1 --temp 0 --temp 1 --temp 2 --temp 5 --llama-baseline $PWD/buildMaster/bin/llama-server --output qwen14b.jsonl --hf bartowski/Qwen2.5-14B-Instruct-GGUF:Q4_K_L
     ./scripts/tool_bench.py run --n 30 --temp -1 --temp 0 --temp 1 --model "Qwen 2.5 1.5B Q4_K_M" --output qwen1.5b.jsonl --hf bartowski/Qwen2.5-1.5B-Instruct-GGUF --ollama qwen2.5:1.5b-instruct-q4_K_M
     ./scripts/tool_bench.py run --n 30 --temp -1 --temp 0 --temp 1 --model "Qwen 2.5 Coder 7B Q4_K_M" --output qwenc7b.jsonl --hf bartowski/Qwen2.5-Coder-7B-Instruct-GGUF --ollama qwen2.5-coder:7b

@@ -205,6 +206,7 @@ def run(
     model: Annotated[Optional[str], typer.Option(help="Name of the model to test (server agnostic)")] = None,
     hf: Annotated[Optional[str], typer.Option(help="GGUF huggingface model repo id (+ optional quant) to test w/ llama-server")] = None,
     chat_template: Annotated[Optional[str], typer.Option(help="Chat template override for llama-server")] = None,
+    chat_template_file: Annotated[Optional[str], typer.Option(help="Chat template file override for llama-server")] = None,
     ollama: Annotated[Optional[str], typer.Option(help="Ollama model tag to test")] = None,
     llama_baseline: Annotated[Optional[str], typer.Option(help="llama-server baseline binary path to use as baseline")] = None,
     n: Annotated[int, typer.Option(help="Number of times to run each test")] = 10,
@@ -229,6 +231,12 @@ def run(
     # n_ctx = 8192
     n_ctx = 2048

+    if model is None:
+        if hf is not None:
+            model = hf.split("/")[-1]
+        elif ollama is not None:
+            model = ollama
+
     assert force or append or not output.exists(), f"Output file already exists: {output}; use --force to overwrite"

     with output.open('a' if append else 'w') as output_file:
@@ -320,6 +328,7 @@ def elapsed():
             server.model_hf_repo = hf
             server.model_hf_file = None
             server.chat_template = chat_template
+            server.chat_template_file = chat_template_file
             server.server_path = server_path
             if port is not None:
                 server.server_port = port
@@ -335,6 +344,7 @@ def elapsed():
                 temp=t,
                 output_kwargs=dict(
                     chat_template=chat_template,
+                    chat_template_file=chat_template_file,
                 ),
                 request_kwargs=dict(
                     ignore_chat_grammar=ignore_chat_grammar,
@@ -355,6 +365,7 @@ def elapsed():
                 temp=t,
                 output_kwargs=dict(
                     chat_template=None,
+                    chat_template_file=None,
                 ),
                 request_kwargs=dict(
                     model=ollama,
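
A minimal sketch of an invocation exercising the new option (hypothetical model repo and template path; assuming Typer's default flag naming, which exposes the chat_template_file parameter as --chat-template-file):

    ./scripts/tool_bench.py run --n 10 --temp 0 --output qwen7b.jsonl --hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M --chat-template-file ./template.jinja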