Skip to content

Commit c00e421

Browse files
committed
Docs: FMA CUDA throughput
1 parent ce1e3b7 commit c00e421

File tree

2 files changed

+21
-1
lines changed

2 files changed

+21
-1
lines changed

.vscode/settings.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
"accum",
66
"Adelstein",
77
"Andreas",
8+
"APSP",
89
"ashvardanian",
910
"ASIO",
1011
"asynchrony",
@@ -31,6 +32,7 @@
3132
"CTRE",
3233
"cublas",
3334
"CUDA",
35+
"cuobjdump",
3436
"denormal",
3537
"DOTPROD",
3638
"DPDK",
@@ -77,6 +79,7 @@
7779
"MSVC",
7880
"Müller",
7981
"multishot",
82+
"Needleman",
8083
"Neoverse",
8184
"Niebler",
8285
"Niels",
@@ -97,8 +100,10 @@
97100
"prefetcher",
98101
"pthread",
99102
"PTXAS",
103+
"quadpair",
100104
"RDMA",
101105
"reorderable",
106+
"semiring",
102107
"Shankhdhar",
103108
"simdjson",
104109
"sinf",
@@ -115,6 +120,7 @@
115120
"Threadblock",
116121
"TMUL",
117122
"Trettner",
123+
"uchar",
118124
"Unbundling",
119125
"Unif",
120126
"unifex",
@@ -125,10 +131,13 @@
125131
"vfmadd",
126132
"VNNI",
127133
"VPCLMULQDQ",
134+
"WarpGroup",
135+
"Warshall",
128136
"Weis",
129137
"WGMMA",
130138
"wmma",
131139
"Worklog",
140+
"Wunsch",
132141
"XCOMP",
133142
"XFEATURE",
134143
"XTILE",

less_slow.cu

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,17 @@ __global__ void tops_u24u32_sm60fma_16x16x16_loop128_cuda_kernel() {
222222
}
223223

224224
/**
225+
* With those instructions we can expect the following throughput on H200:
226+
*
227+
* - `f64` FMA: 4.5 T
228+
* - `i64` FMA: 3.1 T
229+
* - `f32` FMA: 22 T
230+
* - `i32` FMA: 15.5 T, so we should always prefer 32-bit ops
231+
* - `u8u32` DP4A: 39.3 T
232+
* - `u24u32` UMUL: 13.4 T, not really better than `i32` FMA
233+
* - `f16` FMA: 12.2 T on Volta
234+
* - `bf16` FMA: 12.2 T on Ampere
235+
*
225236
* Given the growing demand for such workloads, new Dynamic Programming
226237
* eXtensions @b (DPX) have been added on Hopper for various combinations
227238
* of { addition, min, max, ReLU } on 8-bit and 16-bit integer inputs.
@@ -288,7 +299,7 @@ __global__ void tops_i32i32_sm90dpx_16x16x16_loop128_smith_waterman_cuda_kernel(
288299
/**
289300
* On H200, the following integer performance can be expected:
290301
*
291-
* - Naive FMA for `i32` and `i64` inputs: 2.3 P
302+
* - Naive FMA for `i32` and `i64` inputs: 15.5 T and 3.1 T
292303
* - Hopper DPX for Floyd-Warshall algorithm with `u16` and `u32`: 11 T
293304
* - Hopper DPX for Needleman-Wunsch algorithm with `i16` and `i32`: 11 T
294305
* - Hopper DPX for Smith-Waterman algorithm with `i32`: 27 T

0 commit comments

Comments
 (0)