82
82
# NVIDIA L20
83
83
ALGO = MMA16816 HGEMM NN MMA=2x4 WARP=4x4x2 STAGES=2 BLOCK SWIZZLE=2048
84
84
M N K = 12544 12544 12544, Time = 0.03445555 0.03446098 0.03447399 s, AVG Performance = 114.5541 Tflops
85
- M N K = 12800 12800 12800, Time = 0.03651175 0.03652291 0.03653325 s, AVG Performance = 114.8404 Tflops
86
- M N K = 13056 13056 13056, Time = 0.03893658 0.03893934 0.03894375 s, AVG Performance = 114.3067 Tflops
87
- M N K = 13312 13312 13312, Time = 0.04108800 0.04109589 0.04111155 s, AVG Performance = 114.8052 Tflops
88
- M N K = 13568 13568 13568, Time = 0.04365005 0.04365251 0.04365619 s, AVG Performance = 114.4375 Tflops
89
- M N K = 13824 13824 13824, Time = 0.04591821 0.04593121 0.04594585 s, AVG Performance = 115.0332 Tflops
90
- M N K = 14080 14080 14080, Time = 0.04861338 0.04861614 0.04862054 s, AVG Performance = 114.8306 Tflops
91
- M N K = 14336 14336 14336, Time = 0.05134848 0.05135278 0.05136691 s, AVG Performance = 114.7493 Tflops
92
- M N K = 14592 14592 14592, Time = 0.05417882 0.05418947 0.05421568 s, AVG Performance = 114.6726 Tflops
93
- M N K = 14848 14848 14848, Time = 0.05706547 0.05706916 0.05707469 s, AVG Performance = 114.7182 Tflops
94
- M N K = 15104 15104 15104, Time = 0.06001767 0.06002084 0.06002586 s, AVG Performance = 114.8164 Tflops
95
85
M N K = 15360 15360 15360, Time = 0.06307226 0.06307789 0.06308864 s, AVG Performance = 114.9017 Tflops
96
86
M N K = 15616 15616 15616, Time = 0.06612480 0.06612798 0.06613094 s, AVG Performance = 115.1739 Tflops
97
87
M N K = 15872 15872 15872, Time = 0.06969549 0.06970215 0.06971290 s, AVG Performance = 114.7305 Tflops
@@ -102,16 +92,6 @@ M N K = 16384 16384 16384, Time = 0.07663001 0.07663534 0.07664947 s, A
102
92
# NVIDIA L20
103
93
ALGO = CuTe HGEMM, TN, STAGES=2, SMEM SWIZZLE=< 3, 3, 3> , BLOCK SWIZZLE=2048
104
94
M N K = 12544 12544 12544, Time = 0.03413504 0.03414354 0.03415450 s, AVG Performance = 115.6191 Tflops
105
- M N K = 12800 12800 12800, Time = 0.03615642 0.03616481 0.03617178 s, AVG Performance = 115.9775 Tflops
106
- M N K = 13056 13056 13056, Time = 0.03821158 0.03821455 0.03821671 s, AVG Performance = 116.4747 Tflops
107
- M N K = 13312 13312 13312, Time = 0.04033536 0.04033894 0.04034560 s, AVG Performance = 116.9595 Tflops
108
- M N K = 13568 13568 13568, Time = 0.04318720 0.04319130 0.04319949 s, AVG Performance = 115.6595 Tflops
109
- M N K = 13824 13824 13824, Time = 0.04541542 0.04541942 0.04542157 s, AVG Performance = 116.3294 Tflops
110
- M N K = 14080 14080 14080, Time = 0.04770918 0.04772137 0.04772761 s, AVG Performance = 116.9836 Tflops
111
- M N K = 14336 14336 14336, Time = 0.05077402 0.05077955 0.05078426 s, AVG Performance = 116.0447 Tflops
112
- M N K = 14592 14592 14592, Time = 0.05324902 0.05326633 0.05327872 s, AVG Performance = 116.6599 Tflops
113
- M N K = 14848 14848 14848, Time = 0.05638758 0.05640591 0.05643162 s, AVG Performance = 116.0671 Tflops
114
- M N K = 15104 15104 15104, Time = 0.05892505 0.05893622 0.05894246 s, AVG Performance = 116.9294 Tflops
115
95
M N K = 15360 15360 15360, Time = 0.06227354 0.06228111 0.06228992 s, AVG Performance = 116.3717 Tflops
116
96
M N K = 15616 15616 15616, Time = 0.06492467 0.06493727 0.06496666 s, AVG Performance = 117.2858 Tflops
117
97
M N K = 15872 15872 15872, Time = 0.06843085 0.06843873 0.06844723 s, AVG Performance = 116.8485 Tflops
@@ -122,16 +102,6 @@ M N K = 16384 16384 16384, Time = 0.07564493 0.07565752 0.07567462 s, A
122
102
# NVIDIA L20
123
103
ALGO = cuBLAS CUBLAS_GEMM_DEFAULT_TENSOR_OP TN
124
104
M N K = 12544 12544 12544, Time = 0.03472691 0.03472968 0.03473408 s, AVG Performance = 113.6678 Tflops
125
- M N K = 12800 12800 12800, Time = 0.03687321 0.03687834 0.03688038 s, AVG Performance = 113.7335 Tflops
126
- M N K = 13056 13056 13056, Time = 0.03909427 0.03910103 0.03910963 s, AVG Performance = 113.8341 Tflops
127
- M N K = 13312 13312 13312, Time = 0.04140135 0.04141281 0.04148429 s, AVG Performance = 113.9266 Tflops
128
- M N K = 13568 13568 13568, Time = 0.04382720 0.04383375 0.04384461 s, AVG Performance = 113.9643 Tflops
129
- M N K = 13824 13824 13824, Time = 0.04629504 0.04630118 0.04630733 s, AVG Performance = 114.1140 Tflops
130
- M N K = 14080 14080 14080, Time = 0.04889805 0.04891136 0.04898202 s, AVG Performance = 114.1375 Tflops
131
- M N K = 14336 14336 14336, Time = 0.05156966 0.05157878 0.05158503 s, AVG Performance = 114.2465 Tflops
132
- M N K = 14592 14592 14592, Time = 0.05437849 0.05439980 0.05445734 s, AVG Performance = 114.2292 Tflops
133
- M N K = 14848 14848 14848, Time = 0.05723853 0.05725573 0.05730202 s, AVG Performance = 114.3444 Tflops
134
- M N K = 15104 15104 15104, Time = 0.06022963 0.06024274 0.06032179 s, AVG Performance = 114.3935 Tflops
135
105
M N K = 15360 15360 15360, Time = 0.06332416 0.06333143 0.06334157 s, AVG Performance = 114.4417 Tflops
136
106
M N K = 15616 15616 15616, Time = 0.06649446 0.06650184 0.06651699 s, AVG Performance = 114.5264 Tflops
137
107
M N K = 15872 15872 15872, Time = 0.06977024 0.06977659 0.06978355 s, AVG Performance = 114.6081 Tflops
@@ -151,50 +121,33 @@ M N K = 16384 16384 16384, Time = 0.07668429 0.07669371 0.07670784 s, A
151
121
<!-- -
152
122

153
123

124
+ 
154
125
155
126
--->
156
- ![ NVIDIA_L20_NN+TN] ( https://github.com/user-attachments/assets/89bac543-7272-44cd-b616-54df8ca23a91 )
157
127
128
+ ![NVIDIA_L20_NN+TN+v2](https://github.com/user-attachments/assets/71927ac9-72b3-4ce9-b0e2-788b5885bc99)
158
129
159
130
- WMMA: Up to 113.83 TFLOPS, 113.83/119.5=95.25% TFLOPS utilization, 113.83/116.25=97.91% cuBLAS performance.
160
131
- MMA: Up to 115.12 TFLOPS, 115.12/119.5=96.33% TFLOPS utilization, 115.12/116.25=99.03% cuBLAS performance.
161
-
162
- ``` bash
163
- python3 hgemm.py --M 16384 --N 16384 --K 8192 --mma-all --wmma-all --cuda-all
164
- ----------------------------------------------------------------------------------------------------------------------------------
165
- M=16384, N=16384, K=8192, Warmup=2, Iters=10, 1/1
166
- ----------------------------------------------------------------------------------------------------------------------------------
167
- (naive): [' -236.75 ' , ' 176.0 ' ], time:1835.537ms, swizzle: NOOP, TFLOPS: 2.40 (+0.00%)
168
- (f16x8pack+t8x8+bcf): [' -236.75 ' , ' 176.0 ' ], time:99.63080ms, swizzle: NOOP, TFLOPS: 44.14 (+1742.34%)
169
- (f16x8pack+t8x8+k16+dbuf): [' -236.75 ' , ' 176.0 ' ], time:98.20067ms, swizzle: NOOP, TFLOPS: 44.79 (+1.46%)
170
- --------------------------------------------------------------------WMMA----------------------------------------------------------
171
- (wmma4x2+warp2x4): [' -234.0 ' , ' 181.0 ' ], time:55.99505ms, swizzle: NOOP, TFLOPS: 78.54 (+75.37%)
172
- (wmma4x2+warp2x4+stage3): [' -234.0 ' , ' 181.0 ' ], time:49.62856ms, swizzle: NOOP, TFLOPS: 88.62 (+12.83%)
173
- (wmma4x2+warp2x4+stage3+dsmem): [' -234.0 ' , ' 181.0 ' ], time:49.62389ms, swizzle: NOOP, TFLOPS: 88.63 (+0.01%)
174
- (wmma4x2+warp2x4+stage3+swizzle): [' -234.0 ' , ' 181.0 ' ], time:39.11254ms, swizzle: 4096, TFLOPS: 112.45(+26.87%)
175
- (wmma4x2+warp2x4+stage2+swizzle): [' -234.0 ' , ' 181.0 ' ], time:38.63754ms, swizzle: 4096, TFLOPS: 113.83(+1.23%)
176
- --------------------------------------------------------------------MMA-----------------------------------------------------------
177
- (mma2x4+warp4x4+stage2+swizzle): [' -234.0 ' , ' 181.0 ' ], time:38.40544ms, swizzle: 4096, TFLOPS: 114.52(+0.60%)
178
- (mma2x4+warp4x4+stage2+dsmem+swizzle): [' -234.0 ' , ' 181.0 ' ], time:38.20540ms, swizzle: 4096, TFLOPS: 115.12(+0.52%)
179
- (cublas): [' -234.0 ' , ' 181.0 ' ], time:37.83144ms, swizzle: NOOP, TFLOPS: 116.25(+0.99%)
180
- ----------------------------------------------------------------------------------------------------------------------------------
181
- ```
132
+
182
133
全量MNK测试命令(提示: 每个MNK单独测试的性能数据更准确)
183
134
``` bash
184
- python3 hgemm.py --cute-tn --mma --plot --dir tmp --tag NN+TN --i 20 --wmma-all
135
+ python3 hgemm.py --cute-tn --mma --plot
185
136
```
186
137
187
138
### NVIDIA GeForce RTX 4090
188
139
在NVIDIA RTX 4090上(FP16 Tensor Cores算力为330 TFLOPS),WMMA(m16n16k16)性能表现比MMA(m16n8k16)要更好,大部分MNK下,本仓库的实现能达到cuBLAS 95%~99%的性能,某些case能超过cuBLAS。就本仓库的实现而言,在RTX 4090上,大规模矩阵乘(MNK>=8192),WMMA表现更优,小规模矩阵乘,MMA表现更优。
189
140
190
141
<!-- -
191
142

143
+ 
144
+
192
145
--->
193
146
194
- ![ NVIDIA_GeForce_RTX_4090_NN+TN] ( https://github.com/user-attachments/assets/d8d7380b-4271-41f6-964a-ac3fa81f7f4c )
147
+ ![NVIDIA_GeForce_RTX_4090_NN+TN+v4](https://github.com/user-attachments/assets/05ef4f5e-d999-48ea-b58e-782cffb24e85)
195
148
196
149
``` bash
197
- python3 hgemm.py --cute-tn --mma --plot --dir tmp --tag NN+TN --i 20 --wmma-all
150
+ python3 hgemm.py --cute-tn --mma --wmma-all --plot
198
151
```
199
152
200
153
### NVIDIA GeForce RTX 3080 Laptop
@@ -204,7 +157,7 @@ python3 hgemm.py --cute-tn --mma --plot --dir tmp --tag NN+TN --i 20 --wmma-all
204
157
![](./NVIDIA_GeForce_RTX_3080_Laptop_GPU_WSL2.png)
205
158
206
159
``` bash
207
- python3 hgemm.py --wmma-all --plot --dir tmp
160
+ python3 hgemm.py --wmma-all --plot
208
161
```
209
162
210
163
0 commit comments