Skip to content
Open

New #52

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 1 addition & 27 deletions Qwen/Qwen2.5-VL-7B-Instruct/accuracy/tasks.yml
Original file line number Diff line number Diff line change
@@ -1,31 +1,5 @@
# from gs://nm-vllm-certs/model-validation/lmeval/Qwen/Qwen2.5-7B/cuda/0.9.1.dev238+g922878ced/k8s-a100-duo/llm_eval_16218386266.json
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.6373

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.8074

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8024

- name: mmlu
metrics:
- name: acc,none
value: 0.7424

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.5633

- name: winogrande
metrics:
- name: acc,none
value: 0.7505
value: 0.6944
5 changes: 5 additions & 0 deletions Qwen/Qwen3-8B-FP8/accuracy/tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
tasks:
- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.8756
3 changes: 3 additions & 0 deletions Qwen/Qwen3-8B-FP8/storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# storage configs for https://huggingface.co/Qwen/Qwen3-8B-FP8
model: hf
data: hf
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
trust-remote-code: true
tensor-parallel-size: 2
tensor-parallel-size: 4
max-model-len: 16384
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
tasks:
- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.6944
3 changes: 3 additions & 0 deletions RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# storage configs for https://huggingface.co/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic
model: hf
data: hf
7 changes: 7 additions & 0 deletions RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic/accuracy/server.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
enable-chunked-prefill: true
max-model-len: 9000
tensor-parallel-size: 1
trust-remote-code: true
tokenizer_mode: mistral
config_format: mistral
load_format: mistral
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
max-model-len: 448
trust-remote-code: true
5 changes: 5 additions & 0 deletions deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/accuracy/tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
tasks:
- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.8271
3 changes: 3 additions & 0 deletions deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# storage configs for https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
model: hf
data: hf
4 changes: 4 additions & 0 deletions distil-whisper/distil-large-v3/accuracy/server.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
max-model-len: 448
tensor-parallel-size: 1
trust-remote-code: true
uvicorn-log-level: debug
30 changes: 30 additions & 0 deletions ibm-granite/granite-4.0-h-small/accuracy/model_card_tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.7278

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.85

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8370

- name: mmlu
metrics:
- name: acc,none
value: 0.8067

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.7062

- name: winogrande
metrics:
- name: acc,none
value: 0.8374
60 changes: 60 additions & 0 deletions ibm-granite/granite-4.0-h-small/accuracy/tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.715

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.85

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8573

- name: mmlu
metrics:
- name: acc,none
value: 0.8109

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.6409

- name: winogrande
metrics:
- name: acc,none
value: 0.8374

- name: leaderboard_ifeval
metrics:
- name: inst_level_strict_acc,none
value: 0.729

# - name: leaderboard_bbh
# metrics:
# - name: acc-norm,none
# value: 0.5319

# - name: leaderboard_math_hard
# metrics:
# - name: exact_match,none
# value: 0.1477

# - name: leaderboard_gpqa
# metrics:
# - name: acc-norm,none
# value: 0.3176

# - name: leaderboard_musr
# metrics:
# - name: acc-norm,none
# value: 0.4601

- name: leaderboard_mmlu_pro
metrics:
- name: acc,none
value: 0.5545
3 changes: 3 additions & 0 deletions ibm-granite/granite-4.0-h-small/storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# storage configs for https://huggingface.co/ibm-granite/granite-4.0-h-small
model: hf
data: hf
30 changes: 30 additions & 0 deletions ibm-granite/granite-4.0-micro/accuracy/model_card_tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.7278

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.79

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8370

- name: mmlu
metrics:
- name: acc,none
value: 0.8067

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.7062

- name: winogrande
metrics:
- name: acc,none
value: 0.8374
60 changes: 60 additions & 0 deletions ibm-granite/granite-4.0-micro/accuracy/tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.715

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.79

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8573

- name: mmlu
metrics:
- name: acc,none
value: 0.8109

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.6409

- name: winogrande
metrics:
- name: acc,none
value: 0.8374

- name: leaderboard_ifeval
metrics:
- name: inst_level_strict_acc,none
value: 0.729

# - name: leaderboard_bbh
# metrics:
# - name: acc-norm,none
# value: 0.5319

# - name: leaderboard_math_hard
# metrics:
# - name: exact_match,none
# value: 0.1477

# - name: leaderboard_gpqa
# metrics:
# - name: acc-norm,none
# value: 0.3176

# - name: leaderboard_musr
# metrics:
# - name: acc-norm,none
# value: 0.4601

- name: leaderboard_mmlu_pro
metrics:
- name: acc,none
value: 0.5545
3 changes: 3 additions & 0 deletions ibm-granite/granite-4.0-micro/storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# storage configs for https://huggingface.co/ibm-granite/granite-4.0-micro
model: hf
data: hf
30 changes: 30 additions & 0 deletions microsoft/Phi-3-medium-4k-instruct/accuracy/model_card_tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.6442

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.85

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8437

- name: mmlu
metrics:
- name: acc,none
value: 0.803

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.5937

- name: winogrande
metrics:
- name: acc,none
value: 0.8058
55 changes: 55 additions & 0 deletions microsoft/Phi-3-medium-4k-instruct/accuracy/tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.6825

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.82

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8435

- name: mmlu
metrics:
- name: acc,none
value: 0.803

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.5934

- name: winogrande
metrics:
- name: acc,none
value: 0.8011

- name: leaderboard_ifeval
metrics:
- name: inst_level_strict_acc,none
value: 0.0587

- name: leaderboard_gpqa_diamond
metrics:
- name: acc-norm,none
value: 0.3939

- name: leaderboard_gpqa_extended
metrics:
- name: acc-norm,none
value: 0.3882

- name: leaderboard_gpqa_main
metrics:
- name: acc-norm,none
value: 0.4129

- name: leaderboard_mmlu_pro
metrics:
- name: acc,none
value: 0.53
7 changes: 7 additions & 0 deletions mistralai/Voxtral-Mini-3B-2507/accuracy/server.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
enable-chunked-prefill: true
max-model-len: 9000
tensor-parallel-size: 1
trust-remote-code: true
tokenizer_mode: mistral
config_format: mistral
load_format: mistral
8 changes: 8 additions & 0 deletions mistralai/Voxtral-Small-24B-2507/performance/server.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
enable-chunked-prefill: true
max-model-len: 9000
tensor-parallel-size: 1
trust-remote-code: true
uvicorn-log-level: debug
tokenizer_mode: mistral
config_format: mistral
load_format: mistral
3 changes: 3 additions & 0 deletions mistralai/Voxtral-Small-24B-2507/storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# storage configs for https://huggingface.co/mistralai/Voxtral-Small-24B-2507
model: hf
data: hf
7 changes: 7 additions & 0 deletions openai/gpt-oss-120b/performance/server.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
enable-chunked-prefill: true
max-model-len: 10000
tensor-parallel-size: 2
trust-remote-code: true
uvicorn-log-level: debug
tool-call-parser: openai
enable-auto-tool-choice: true
2 changes: 1 addition & 1 deletion openai/gpt-oss-20b/accuracy/tasks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ tasks:
- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0
value: 0.2494
Loading