@@ -64,42 +64,42 @@ torch version 1.10.0+cu113, transformers version 4.12.5
64
64
65
65
| Model Size | Decode Strategy| FasterGeneration(FP32)<br >(ms) | FasterGeneration(FP16)<br >(ms) | HF generate<br >(ms) | Speed Up Rate<br >(Faster32/HF) | Speed Up Rate<br >(Faster16/HF) |
66
66
| -----| ----| ---| ---| ---| ---| ---|
67
- |num_layers = 6<br >num_attention_heads = 12<br >hidden_size = 768<br >(bart-base)|top_k = 1|30.08 |27.95|166.90|5.55 |5.97 |
68
- | | top_k = 4 | 30.82 | 30.01 | 184.58 | 5.99 | 6.15 |
69
- | | top_k = 8 | 32.06 | 31.05 | 183.44 | 5.72 | 5.91 |
70
- | | top_k = 16 | 32.66 | 32.35 | 187.14 | 5.73 | 5.78 |
71
- | | top_p = 0.4 | 37.99 | 30.25 | 208.33 | 5.48 | 6.89 |
72
- | | num_beams = 4| 45.99 | 37.51 | 285.01 | 5.43 | 7.6 |
73
- | | num_beams = 8| 50.12 | 37.82 | 316.56 | 6.32 | 8.37 |
74
- | | num_beams = 16| 67.66 | 40.98 | 467.76 | 6.91 | 11.41 |
75
- | num_layers = 12<br >num_attention_heads = 16<br >hidden_size = 1024<br >(bart-large)| top_k = 1| 50.23 | 39.08 | 222.59 | 4.43 | 5.7 |
76
- | | top_k = 4| 60.59 | 48.32 | 307.76 | 5.08 | 6.37 |
77
- | | top_k = 8| 59.67 | 49.65 | 310.49 | 5.20 | 6.25 |
78
- | | top_k = 16| 59.15 | 52.68 | 333.75 | 5.64 | 6.34 |
79
- | | top_p = 0.4| 61.36 | 50.83 | 340.74 | 5.55 | 6.7 |
80
- | | num_beams = 4| 65.60 | 53.24 | 336.28 | 5.12 | 6.32 |
81
- | | num_beams = 8| 76.20 | 54.13 | 396.62 | 5.20 | 7.33 |
82
- | | num_beams = 16| 102.04 | 61.11 | 531.92 | 5.21 | 8.7 |
67
+ |num_layers = 6<br >num_attention_heads = 12<br >hidden_size = 768<br >(bart-base)|top_k = 1|31.1 |27.4|139.46|4.48 |5.09 |
68
+ | | top_k = 4 | 32.13 | 29.06 | 149.81 | 4.66 | 5.16 |
69
+ | | top_k = 8 | 31.7 | 28.36 | 154.3 | 4.87 | 5.44 |
70
+ | | top_k = 16 | 32.93 | 28.66 | 145.85 | 4.43 | 5.09 |
71
+ | | top_p = 0.4 | 33.35 | 29.01 | 173.18 | 5.19 | 5.97 |
72
+ | | num_beams = 4| 47.55 | 38.02 | 252.71 | 5.31 | 6.65 |
73
+ | | num_beams = 8| 52.19 | 41.39 | 282.3 | 5.41 | 6.82 |
74
+ | | num_beams = 16| 67.18 | 45.82 | 441.59 | 6.57 | 9.64 |
75
+ | num_layers = 12<br >num_attention_heads = 16<br >hidden_size = 1024<br >(bart-large)| top_k = 1| 45.8 | 37.43 | 173.08 | 3.78 | 4.62 |
76
+ | | top_k = 4| 51.11 | 48.28 | 246.27 | 4.82 | 5.1 |
77
+ | | top_k = 8| 61.61 | 50.67 | 246.19 | 4.0 | 4.86 |
78
+ | | top_k = 16| 63.81 | 48.33 | 272.93 | 4.28 | 5.65 |
79
+ | | top_p = 0.4| 63.0 | 50.05 | 288.76 | 4.58 | 5.77 |
80
+ | | num_beams = 4| 65.54 | 48.58 | 273.84 | 4.18 | 5.64 |
81
+ | | num_beams = 8| 75.68 | 52.59 | 340.86 | 4.5 | 6.48 |
82
+ | | num_beams = 16| 102.87 | 62.25 | 477.97 | 4.65 | 7.68 |
83
83
84
84
**GPT:**
85
85
86
86
| Model Size | Decode Strategy| FasterGeneration(FP32)<br >(ms) | FasterGeneration(FP16)<br >(ms) | HF generate<br >(ms) | Speed Up Rate<br >(Faster32/HF) | Speed Up Rate<br >(Faster16/HF) |
87
87
| -----| ----| ---| ---| ---| ---| ---|
88
- | num_layers = 12<br >num_attention_heads = 12<br >hidden_size = 768<br >(gpt2)| top_k = 1| 49.75 | 40.15 | 483.02 | 9.71 | 12.03 |
89
- | | top_k = 4| 49.70 | 41.69 | 496.63 | 9.99 | 11.91 |
90
- | | top_k = 8| 51.81 | 40.81 | 485.77 | 9.38 | 11.9 |
91
- | | top_k = 16| 50.36 | 42.88 | 488.38 | 9.70 | 11.39 |
92
- | | top_p = 0.4| 68.30 | 53.58 | 544.53 | 7.97 | 10.16 |
93
- | num_layers = 24<br >num_attention_heads = 16<br >hidden_size = 1024<br >(gpt2-medium)| top_k = 1| 109.86 | 76.88 | 936.02 | 8.52 | 12.18 |
94
- | | top_k = 4| 109.69 | 78.70 | 943.71 | 8.60 | 11.99 |
95
- | | top_k = 8| 109.70 | 78.39 | 963.73 | 8.79 | 12.29 |
96
- | | top_k = 16| 111.18 | 79.05 | 945.27 | 8.50 | 11.96 |
97
- | | top_p = 0.4| 127.54 | 89.76 | 999.28 | 7.83 | 11.13 |
98
- | num_layers = 36<br >num_attention_heads = 20<br >hidden_size = 1280<br >(gpt2-large)| top_k = 1| 205.92 | 142.85| 1368.78 | 6.65 | 9.58 |
99
- | | top_k = 4| 205.43 | 140.40 | 1374.83 | 6.69 | 9.79 |
100
- | | top_k = 8| 205.62 | 139.47 | 1406.42 | 6.84 | 10.08 |
101
- | | top_k = 16| 205.16 | 139.77 | 1392.37 | 6.79 | 9.96 |
102
- | | top_p = 0.4| 221.06 | 152.35 | 1452.07 | 6.57 | 9.53 |
88
+ | num_layers = 12<br >num_attention_heads = 12<br >hidden_size = 768<br >(gpt2)| top_k = 1| 50.84 | 40.37 | 399.58 | 7.86 | 9.9 |
89
+ | | top_k = 4| 50.38 | 38.81 | 419.55 | 8.33 | 10.81 |
90
+ | | top_k = 8| 51.23 | 36.78 | 411.7 | 8.04 | 11.19 |
91
+ | | top_k = 16| 51.03 | 38.76 | 408.36 | 8.0 | 10.54 |
92
+ | | top_p = 0.4| 68.55 | 48.04 | 489.45 | 7.14 | 10.19 |
93
+ | num_layers = 24<br >num_attention_heads = 16<br >hidden_size = 1024<br >(gpt2-medium)| top_k = 1| 111.37 | 79.73 | 753.11 | 6.76 | 9.45 |
94
+ | | top_k = 4| 110.53 | 80.48 | 767.48 | 6.94 | 9.54 |
95
+ | | top_k = 8| 109.87 | 78.92 | 754.99 | 6.87 | 9.57 |
96
+ | | top_k = 16| 110.61 | 85.26 | 764.16 | 6.91 | 8.96 |
97
+ | | top_p = 0.4| 127.51 | 87.72 | 830.24 | 6.51 | 9.46 |
98
+ | num_layers = 36<br >num_attention_heads = 20<br >hidden_size = 1280<br >(gpt2-large)| top_k = 1| 203.76 | 142.85| 1108.26 | 5.44 | 7.76 |
99
+ | | top_k = 4| 204.18 | 139.49 | 1230.63 | 6.03 | 8.82 |
100
+ | | top_k = 8| 204.22 | 139.14 | 1238.96 | 6.07 | 8.9 |
101
+ | | top_k = 16| 204.11 | 140.04 | 1148.05 | 5.62 | 8.2 |
102
+ | | top_p = 0.4| 222.12 | 150.68 | 1248.75 | 5.62 | 8.29 |
103
103
104
104
105
105
## 测试方法
0 commit comments