Skip to content

Commit 78b47bb

Browse files
author
ochafik
committed
fix test_calc_result
1 parent 326e700 commit 78b47bb

File tree

1 file changed

+23
-37
lines changed

1 file changed

+23
-37
lines changed

examples/server/tests/unit/test_tool_call.py

Lines changed: 23 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -341,43 +341,23 @@ def test_weather_tool_call(hf_repo: str, template_override: str | Tuple[str, str
341341

342342

343343
@pytest.mark.slow
344-
@pytest.mark.parametrize("n_predict,hf_repo,template_override", [
344+
@pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [
345+
(None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
346+
(None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
347+
(None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
348+
(None, 128, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
349+
(None, 128, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
350+
(None, 128, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
351+
(None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
345352
346-
(8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
347-
(8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
348-
349-
# (128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
350-
# (128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
351-
352-
(128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
353-
(128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
354-
355-
(128, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
356-
(128, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
357-
358-
(128, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
359-
(128, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
360-
361-
(128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
362-
(128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
363-
364-
(128, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
365-
(128, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
366-
367-
(128, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
368-
# (128, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
369-
370-
(128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
371-
# (128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
372-
373-
# Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
374-
(128, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
375-
376-
# Not working well w/ chatml + polyfill, which is forgiveable
377-
# (128, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
378-
# (128, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
353+
# TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value)
354+
("^So, 0\\.556442\\.", 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
355+
("[\\s\\S\\r\\n]*?\\b0\\.55644242476$", 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
356+
("^> 0.56$", 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
357+
("[\\s\\S\\r\\n]*?which equals 0\\.5\\.", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
358+
("**Answer:** 0\\.25\\b", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
379359
])
380-
def test_calc_result(n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
360+
def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
381361
global server
382362
# n_predict = 512
383363
server.n_slots = 1
@@ -403,6 +383,7 @@ def test_calc_result(n_predict: int, hf_repo: str, template_override: str | Tupl
403383
"content": None,
404384
"tool_calls": [
405385
{
386+
"id": "call_6789",
406387
"type": "function",
407388
"function": {
408389
"name": "calculate",
@@ -414,7 +395,8 @@ def test_calc_result(n_predict: int, hf_repo: str, template_override: str | Tupl
414395
{
415396
"role": "tool",
416397
"name": "calculate",
417-
"content": 0.55644242476
398+
"content": 0.55644242476,
399+
"tool_call_id": "call_6789",
418400
}
419401
],
420402
"tools": [
@@ -443,7 +425,11 @@ def test_calc_result(n_predict: int, hf_repo: str, template_override: str | Tupl
443425
assert tool_calls is None, f'Expected no tool call in {choice["message"]}'
444426
content = choice["message"].get("content")
445427
assert content is not None, f'Expected content in {choice["message"]}'
446-
assert re.match('^(The (y )?coordinate .*?is (approximately )?0.56[.]?|0.56)$', content), f'Expected something like "The y coordinate is 0.56.", got {content}'
428+
if result_override is not None:
429+
assert re.match(result_override, content), f'Expected {result_override}, got {content}'
430+
else:
431+
assert re.match('^[\\s\\S\\r\\n]*?The (y[ -])?coordinate [\\s\\S\\r\\n]*?is (approximately )?0\\.56\\b|^0\\.56$', content), \
432+
f'Expected something like "The y coordinate is 0.56.", got {content}'
447433

448434

449435
@pytest.mark.slow

0 commit comments

Comments
 (0)