diff --git a/Qwen/Qwen2.5-VL-7B-Instruct/accuracy/tasks.yml b/Qwen/Qwen2.5-VL-7B-Instruct/accuracy/tasks.yml index 231aeaf..50fc1bc 100644 --- a/Qwen/Qwen2.5-VL-7B-Instruct/accuracy/tasks.yml +++ b/Qwen/Qwen2.5-VL-7B-Instruct/accuracy/tasks.yml @@ -1,31 +1,5 @@ -# from gs://nm-vllm-certs/model-validation/lmeval/Qwen/Qwen2.5-7B/cuda/0.9.1.dev238+g922878ced/k8s-a100-duo/llm_eval_16218386266.json tasks: - - name: arc_challenge - metrics: - - name: acc_norm,none - value: 0.6373 - - name: gsm8k metrics: - name: exact_match,strict-match - value: 0.8074 - - - name: hellaswag - metrics: - - name: acc_norm,none - value: 0.8024 - - - name: mmlu - metrics: - - name: acc,none - value: 0.7424 - - - name: truthfulqa_mc2 - metrics: - - name: acc,none - value: 0.5633 - - - name: winogrande - metrics: - - name: acc,none - value: 0.7505 + value: 0.6944 diff --git a/Qwen/Qwen3-8B-FP8/accuracy/tasks.yml b/Qwen/Qwen3-8B-FP8/accuracy/tasks.yml new file mode 100644 index 0000000..c4a91b6 --- /dev/null +++ b/Qwen/Qwen3-8B-FP8/accuracy/tasks.yml @@ -0,0 +1,5 @@ +tasks: + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.8756 diff --git a/Qwen/Qwen3-8B-FP8/storage.yml b/Qwen/Qwen3-8B-FP8/storage.yml new file mode 100644 index 0000000..e31645c --- /dev/null +++ b/Qwen/Qwen3-8B-FP8/storage.yml @@ -0,0 +1,3 @@ +# storage configs for https://huggingface.co/Qwen/Qwen3-8B +model: hf +data: hf diff --git a/RedHatAI/Kimi-K2-Instruct-quantized.w4a16/accuracy/server.yml b/RedHatAI/Kimi-K2-Instruct-quantized.w4a16/accuracy/server.yml index 51d25d0..22eafe9 100644 --- a/RedHatAI/Kimi-K2-Instruct-quantized.w4a16/accuracy/server.yml +++ b/RedHatAI/Kimi-K2-Instruct-quantized.w4a16/accuracy/server.yml @@ -1,3 +1,3 @@ trust-remote-code: true -tensor-parallel-size: 2 +tensor-parallel-size: 4 max-model-len: 16384 diff --git a/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/accuracy/tasks.yml b/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/accuracy/tasks.yml new file mode 100644 index 0000000..50fc1bc --- /dev/null +++ b/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/accuracy/tasks.yml @@ -0,0 +1,5 @@ +tasks: + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.6944 diff --git a/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/storage.yml b/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/storage.yml new file mode 100644 index 0000000..51a9ad6 --- /dev/null +++ b/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/storage.yml @@ -0,0 +1,3 @@ +# storage configs for https://huggingface.co/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic +model: hf +data: hf diff --git a/RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic/accuracy/server.yml b/RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic/accuracy/server.yml new file mode 100644 index 0000000..de66392 --- /dev/null +++ b/RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic/accuracy/server.yml @@ -0,0 +1,7 @@ +enable-chunked-prefill: true +max-model-len: 9000 +tensor-parallel-size: 1 +trust-remote-code: true +tokenizer_mode: mistral +config_format: mistral +load_format: mistral diff --git a/RedHatAI/whisper-large-v3-turbo-FP8-dynamic/accuracy/server.yml b/RedHatAI/whisper-large-v3-turbo-FP8-dynamic/accuracy/server.yml new file mode 100644 index 0000000..860ae1f --- /dev/null +++ b/RedHatAI/whisper-large-v3-turbo-FP8-dynamic/accuracy/server.yml @@ -0,0 +1,2 @@ +max-model-len: 448 +trust-remote-code: true diff --git a/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/accuracy/tasks.yml b/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/accuracy/tasks.yml new file mode 100644 index 0000000..14e1988 --- /dev/null +++ b/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/accuracy/tasks.yml @@ -0,0 +1,5 @@ +tasks: + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.8271 diff --git a/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/storage.yml b/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/storage.yml new file mode 100644 index 0000000..fb93360 --- /dev/null +++ b/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/storage.yml @@ -0,0 +1,3 @@ +# storage configs for https://huggingface.co/deepseek-ai/DeepSeek-R1-0528 +model: hf +data: hf diff --git a/distil-whisper/distil-large-v3/accuracy/server.yml b/distil-whisper/distil-large-v3/accuracy/server.yml new file mode 100644 index 0000000..4a4677e --- /dev/null +++ b/distil-whisper/distil-large-v3/accuracy/server.yml @@ -0,0 +1,4 @@ +max-model-len: 448 +tensor-parallel-size: 1 +trust-remote-code: true +uvicorn-log-level: debug diff --git a/ibm-granite/granite-4.0-h-small/accuracy/model_card_tasks.yml b/ibm-granite/granite-4.0-h-small/accuracy/model_card_tasks.yml new file mode 100644 index 0000000..795309c --- /dev/null +++ b/ibm-granite/granite-4.0-h-small/accuracy/model_card_tasks.yml @@ -0,0 +1,30 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.7278 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.85 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8370 + + - name: mmlu + metrics: + - name: acc,none + value: 0.8067 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.7062 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8374 diff --git a/ibm-granite/granite-4.0-h-small/accuracy/tasks.yml b/ibm-granite/granite-4.0-h-small/accuracy/tasks.yml new file mode 100644 index 0000000..293aa1f --- /dev/null +++ b/ibm-granite/granite-4.0-h-small/accuracy/tasks.yml @@ -0,0 +1,60 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.715 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.85 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8573 + + - name: mmlu + metrics: + - name: acc,none + value: 0.8109 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.6409 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8374 + + - name: leaderboard_ifeval + metrics: + - name: inst_level_strict_acc,none + value: 0.729 + + # - name: leaderboard_bbh + # metrics: + # - name: acc-norm,none + # value: 0.5319 + + # - name: leaderboard_math_hard + # metrics: + # - name: exact_match,none + # value: 0.1477 + + # - name: leaderboard_gpqa + # metrics: + # - name: acc-norm,none + # value: 0.3176 + + # - name: leaderboard_musr + # metrics: + # - name: acc-norm,none + # value: 0.4601 + + - name: leaderboard_mmlu_pro + metrics: + - name: acc,none + value: 0.5545 diff --git a/ibm-granite/granite-4.0-h-small/storage.yml b/ibm-granite/granite-4.0-h-small/storage.yml new file mode 100644 index 0000000..1f29c96 --- /dev/null +++ b/ibm-granite/granite-4.0-h-small/storage.yml @@ -0,0 +1,3 @@ +# storage configs for https://huggingface.co/ibm-granite/granite-3.1-8b-instruct +model: hf +data: hf diff --git a/ibm-granite/granite-4.0-micro/accuracy/model_card_tasks.yml b/ibm-granite/granite-4.0-micro/accuracy/model_card_tasks.yml new file mode 100644 index 0000000..76bbb3b --- /dev/null +++ b/ibm-granite/granite-4.0-micro/accuracy/model_card_tasks.yml @@ -0,0 +1,30 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.7278 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.79 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8370 + + - name: mmlu + metrics: + - name: acc,none + value: 0.8067 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.7062 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8374 diff --git a/ibm-granite/granite-4.0-micro/accuracy/tasks.yml b/ibm-granite/granite-4.0-micro/accuracy/tasks.yml new file mode 100644 index 0000000..3640c76 --- /dev/null +++ b/ibm-granite/granite-4.0-micro/accuracy/tasks.yml @@ -0,0 +1,60 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.715 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.79 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8573 + + - name: mmlu + metrics: + - name: acc,none + value: 0.8109 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.6409 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8374 + + - name: leaderboard_ifeval + metrics: + - name: inst_level_strict_acc,none + value: 0.729 + + # - name: leaderboard_bbh + # metrics: + # - name: acc-norm,none + # value: 0.5319 + + # - name: leaderboard_math_hard + # metrics: + # - name: exact_match,none + # value: 0.1477 + + # - name: leaderboard_gpqa + # metrics: + # - name: acc-norm,none + # value: 0.3176 + + # - name: leaderboard_musr + # metrics: + # - name: acc-norm,none + # value: 0.4601 + + - name: leaderboard_mmlu_pro + metrics: + - name: acc,none + value: 0.5545 diff --git a/ibm-granite/granite-4.0-micro/storage.yml b/ibm-granite/granite-4.0-micro/storage.yml new file mode 100644 index 0000000..1f29c96 --- /dev/null +++ b/ibm-granite/granite-4.0-micro/storage.yml @@ -0,0 +1,3 @@ +# storage configs for https://huggingface.co/ibm-granite/granite-3.1-8b-instruct +model: hf +data: hf diff --git a/microsoft/Phi-3-medium-4k-instruct/accuracy/model_card_tasks.yml b/microsoft/Phi-3-medium-4k-instruct/accuracy/model_card_tasks.yml new file mode 100644 index 0000000..50fc02e --- /dev/null +++ b/microsoft/Phi-3-medium-4k-instruct/accuracy/model_card_tasks.yml @@ -0,0 +1,30 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.6442 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.85 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8437 + + - name: mmlu + metrics: + - name: acc,none + value: 0.803 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.5937 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8058 diff --git a/microsoft/Phi-3-medium-4k-instruct/accuracy/tasks.yml b/microsoft/Phi-3-medium-4k-instruct/accuracy/tasks.yml new file mode 100644 index 0000000..36dff96 --- /dev/null +++ b/microsoft/Phi-3-medium-4k-instruct/accuracy/tasks.yml @@ -0,0 +1,55 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.6825 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.82 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8435 + + - name: mmlu + metrics: + - name: acc,none + value: 0.803 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.5934 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8011 + + - name: leaderboard_ifeval + metrics: + - name: inst_level_strict_acc,none + value: 0.0587 + + - name: leaderboard_gpqa_diamond + metrics: + - name: acc-norm,none + value: 0.3939 + + - name: leaderboard_gpqa_extended + metrics: + - name: acc-norm,none + value: 0.3882 + + - name: leaderboard_gpqa_main + metrics: + - name: acc-norm,none + value: 0.4129 + + - name: leaderboard_mmlu_pro + metrics: + - name: acc,none + value: 0.53 diff --git a/mistralai/Voxtral-Mini-3B-2507/accuracy/server.yml b/mistralai/Voxtral-Mini-3B-2507/accuracy/server.yml new file mode 100644 index 0000000..1c30da7 --- /dev/null +++ b/mistralai/Voxtral-Mini-3B-2507/accuracy/server.yml @@ -0,0 +1,7 @@ +enable-chunked-prefill: true +max-model-len: 9000 +tensor-parallel-size: 1 +trust-remote-code: true +tokenizer_mode: mistral +config_format: mistral +load_format: mistral diff --git a/mistralai/Voxtral-Small-24B-2507/performance/server.yml b/mistralai/Voxtral-Small-24B-2507/performance/server.yml new file mode 100644 index 0000000..452fa19 --- /dev/null +++ b/mistralai/Voxtral-Small-24B-2507/performance/server.yml @@ -0,0 +1,8 @@ +enable-chunked-prefill: true +max-model-len: 9000 +tensor-parallel-size: 1 +trust-remote-code: true +uvicorn-log-level: debug +tokenizer_mode: mistral +config_format: mistral +load_format: mistral diff --git a/mistralai/Voxtral-Small-24B-2507/storage.yml b/mistralai/Voxtral-Small-24B-2507/storage.yml new file mode 100644 index 0000000..fc064a6 --- /dev/null +++ b/mistralai/Voxtral-Small-24B-2507/storage.yml @@ -0,0 +1,3 @@ +# https://huggingface.co/mistralai/Voxtral-Small-24B-2507 +model: hf +data: hf diff --git a/openai/gpt-oss-120b/performance/server.yml b/openai/gpt-oss-120b/performance/server.yml new file mode 100644 index 0000000..45a0b3f --- /dev/null +++ b/openai/gpt-oss-120b/performance/server.yml @@ -0,0 +1,7 @@ +enable-chunked-prefill: true +max-model-len: 10000 +tensor-parallel-size: 2 +trust-remote-code: true +uvicorn-log-level: debug +tool-call-parser: openai +enable-auto-tool-choice: true diff --git a/openai/gpt-oss-20b/accuracy/tasks.yml b/openai/gpt-oss-20b/accuracy/tasks.yml index 0b4633c..cf07e78 100644 --- a/openai/gpt-oss-20b/accuracy/tasks.yml +++ b/openai/gpt-oss-20b/accuracy/tasks.yml @@ -2,4 +2,4 @@ tasks: - name: gsm8k metrics: - name: exact_match,strict-match - value: 0 + value: 0.2494 diff --git a/openai/gpt-oss-20b/performance/server.yml b/openai/gpt-oss-20b/performance/server.yml new file mode 100644 index 0000000..cf738a4 --- /dev/null +++ b/openai/gpt-oss-20b/performance/server.yml @@ -0,0 +1,8 @@ +enable-chunked-prefill: true +max-model-len: 10000 +tensor-parallel-size: 1 +trust-remote-code: true +uvicorn-log-level: debug +tool-call-parser: openai +enable-auto-tool-choice: true + diff --git a/openai/whisper-large-v3/accuracy/server.yml b/openai/whisper-large-v3/accuracy/server.yml new file mode 100644 index 0000000..c00b7fb --- /dev/null +++ b/openai/whisper-large-v3/accuracy/server.yml @@ -0,0 +1,3 @@ +max-model-len: 448 +tensor-parallel-size: 1 +trust-remote-code: true diff --git a/openai/whisper-large-v3/accuracy/tasks.yml b/openai/whisper-large-v3/accuracy/tasks.yml new file mode 100644 index 0000000..0b4633c --- /dev/null +++ b/openai/whisper-large-v3/accuracy/tasks.yml @@ -0,0 +1,5 @@ +tasks: + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0