|
1 | | -# bert_base_cased_1_fp16_gpu.onnx |
| 1 | +# bert_base_cased_1_fp16_gpu.onnx and distilgpt2_1_fp16_gpu.onnx |
2 | 2 | -t f16 -transQ false -transK true -transV false -transO false -g 12 -seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 |
3 | 3 |
|
4 | | -# bert_large_uncased_1_fp16_gpu.onnx |
| 4 | +# bert_large_uncased_1_fp16_gpu.onnx and bert_large_mlperf |
5 | 5 | -t f16 -transQ false -transK true -transV false -transO false -g 16 -seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 |
6 | 6 |
|
7 | | -# distilgpt2_1_fp16_gpu.onnx |
8 | | --t f16 -transQ false -transK true -transV false -transO false -g 12 -seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 |
| 7 | +# distilgpt2_1 |
| 8 | +-t f32 -transQ false -transK true -transV false -transO false -g 12 -seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 |
9 | 9 |
|
10 | 10 | # stable-diffusion-2-onnx-unet |
11 | 11 | -t f16 -transQ false -transK true -transV false -transO false -g 10 -seq_len_q 4096 -seq_len_k 4096 -head_dim_qk 64 -head_dim_v 64 |
|
18 | 18 |
|
19 | 19 | # stable-diffusion-2-onnx vae_decoder |
20 | 20 | -t f16 -transQ false -transK false -transV false -transO false -g 1 -seq_len_q 4096 -seq_len_k 4096 -head_dim_qk 512 -head_dim_v 512 |
| 21 | +-t f16 -transQ false -transK true -transV false -transO false -g 1 -seq_len_q 4096 -seq_len_k 4096 -head_dim_qk 512 -head_dim_v 512 |
21 | 22 |
|
22 | | -# bert_large_mlperf.onnx |
23 | | --t f16 -transQ false -transK true -transV false -transO false -g 16 -seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 |
24 | | - |
25 | | -# qwen1.5-7b fp16 |
| 23 | +# qwen1.5-7b fp16, llama3_8b, mistral-7b |
26 | 24 | -t f16 -transQ false -transK false -transV false -transO false -g 32 -seq_len_q 256 -seq_len_k 256 -head_dim_qk 128 -head_dim_v 128 |
| 25 | +-t f16 -transQ false -transK true -transV false -transO false -g 32 -seq_len_q 256 -seq_len_k 256 -head_dim_qk 128 -head_dim_v 128 |
27 | 26 |
|
28 | 27 | # phi3 |
29 | 28 | -t f16 -transQ false -transK false -transV false -transO false -g 32 -seq_len_q 256 -seq_len_k 256 -head_dim_qk 96 -head_dim_v 96 |
30 | 29 |
|
31 | | -# llama3_8b model.onnx |
32 | | --t f16 -transQ false -transK false -transV false -transO false -g 32 -seq_len_q 256 -seq_len_k 256 -head_dim_qk 128 -head_dim_v 128 |
| 30 | +# phi3_3_8b |
| 31 | +-t f16 -transQ false -transK true -transV false -transO false -g 32 -seq_len_q 256 -seq_len_k 256 -head_dim_qk 96 -head_dim_v 96 |
33 | 32 |
|
34 | 33 | # whisper-large encoder_model.onnx |
35 | 34 | -t f32 -transQ false -transK true -transV false -transO false -g 20 -seq_len_q 1500 -seq_len_k 1500 -head_dim_qk 64 -head_dim_v 64 |
36 | 35 |
|
37 | | -# mistral-7b |
38 | | --t f16 -transQ false -transK false -transV false -transO false -g 32 -seq_len_q 256 -seq_len_k 256 -head_dim_qk 128 -head_dim_v 128 |
39 | | - |
40 | | -# Flux |
| 36 | +# Flux and sd3.5 text_encoder |
41 | 37 | -t f16 -transQ false -transK false -transV false -transO false -g 12 -seq_len_q 77 -seq_len_k 77 -head_dim_qk 64 -head_dim_v 64 |
42 | 38 |
|
43 | | -# sd3 text_encoder_3 |
44 | | --t f16 -transQ false -transK true -transV false -transO false -g 64 -seq_len_q 77 -seq_len_k 77 -head_dim_qk 64 -head_dim_v 64 |
| 39 | +# flux_text_encoder |
| 40 | +-t f16 -transQ false -transK true -transV false -transO false -g 12 -seq_len_q 77 -seq_len_k 77 -head_dim_qk 64 -head_dim_v 64 |
45 | 41 |
|
46 | | -# sd3.5 text_encoder |
47 | | --t f16 -transQ false -transK false -transV false -transO false -g 12 -seq_len_q 77 -seq_len_k 77 -head_dim_qk 64 -head_dim_v 64 |
| 42 | +# sd3 and sd3.5 text_encoder_3 |
| 43 | +-t f16 -transQ false -transK true -transV false -transO false -g 64 -seq_len_q 77 -seq_len_k 77 -head_dim_qk 64 -head_dim_v 64 |
48 | 44 |
|
49 | 45 | # sd3.5 text_encoder_2 |
50 | 46 | -t f16 -transQ false -transK false -transV false -transO false -g 20 -seq_len_q 77 -seq_len_k 77 -head_dim_qk 64 -head_dim_v 64 |
51 | | - |
52 | | -# sd3.5 text_encoder_3 |
53 | | --t f16 -transQ false -transK true -transV false -transO false -g 64 -seq_len_q 77 -seq_len_k 77 -head_dim_qk 64 -head_dim_v 64 |
| 47 | +-t f16 -transQ false -transK true -transV false -transO false -g 20 -seq_len_q 77 -seq_len_k 77 -head_dim_qk 64 -head_dim_v 64 |
54 | 48 |
|
55 | 49 | # sd3_medium_vae_encoder |
56 | 50 | -t f32 -transQ false -transK false -transV false -transO false -g 2 -seq_len_q 64 -seq_len_k 64 -head_dim_qk 512 -head_dim_v 512 |
| 51 | +-t f32 -transQ false -transK true -transV false -transO false -g 1 -seq_len_q 64 -seq_len_k 64 -head_dim_qk 512 -head_dim_v 512 |
| 52 | + |
| 53 | +# llama-2-7b-chat-hf-awq-int4-asym-gs128-onnx_prefill |
| 54 | +-t f16 -transQ false -transK true -transV false -transO false -causal true -return_lse false -g 32 -seq_len_q 4096 -seq_len_k 4096 -head_dim_qk 128 -head_dim_v 128 |
| 55 | + |
| 56 | +# llama-2-7b-chat-hf-awq-int4-asym-gs128-onnx_decode |
| 57 | +-t f16 -transQ false -transK true -transV false -transO false -causal false -return_lse false -g 32 -seq_len_q 1 -seq_len_k 4096 -head_dim_qk 128 -head_dim_v 128 |
| 58 | + |
| 59 | +# llama3_8b_kv_cache_prefill |
| 60 | +-t f32 -transQ false -transK true -transV false -transO false -causal true -return_lse false -g 32 -seq_len_q 4096 -seq_len_k 4096 -head_dim_qk 128 -head_dim_v 128 |
| 61 | + |
| 62 | +# mistral_8b_kv_cache_decode |
| 63 | +-t f32 -transQ false -transK true -transV false -transO false -g 32 -seq_len_q 1 -seq_len_k 4097 -head_dim_qk 128 -head_dim_v 128 |
0 commit comments