|
153 | 153 | ## 0x01 📖 CUDA Kernel目录 (面试常考题目)
|
154 | 154 | <div id="kernellist"></div>
|
155 | 155 |
|
| 156 | +- / = not supported now. |
156 | 157 | - ✔️ = known to work and already supported now.
|
157 | 158 | - ❔ = planned, but not coming soon; maybe in a few weeks.
|
158 |
| - |
159 |
| -|📖 cuda kernel| 📖 element dtype| 📖 accumulate dtype| |
160 |
| -|:---|:---|:---| |
161 |
| -| ✔️ [sgemm_sliced_k_f32_kernel](./sgemm/sgemm.cu)|f32|f32| |
162 |
| -| ✔️ [sgemm_t_tile_sliced_k_f32x4_kernel](./sgemm/sgemm.cu)|f32|f32| |
163 |
| -| ❔ [hgemm_sliced_k_f16_f32_kernel](./sgemm/sgemm.cu)|f16|f32| |
164 |
| -| ❔ [hgemm_t_tile_sliced_k_f16x2_f32_kernel](./sgemm/sgemm.cu)|f16|f32| |
165 |
| -| ✔️ [sgemv_k32_f32_kernel](./sgemv/sgemv.cu)|f32|f32| |
166 |
| -| ✔️ [sgemv_k128_f32x4_kernel](./sgemv/sgemv.cu)|f32|f32| |
167 |
| -| ✔️ [sgemv_k16_f32_kernel](./sgemv/sgemv.cu)|f32|f32| |
168 |
| -| ❔ [hgemv_k32_f16_kernel](./sgemv/sgemv.cu)|f16|f16| |
169 |
| -| ❔ [hgemv_k128_f16x2_kernel](./sgemv/sgemv.cu)|f16|f16| |
170 |
| -| ❔ [hgemv_k16_f16_kernel](./sgemv/sgemv.cu)|f16|f16| |
171 |
| -| ✔️ [warp_reduce_f32/f16/bf16_kernel](./reduce/block_all_reduce.cu)|f16/bf16/f32|f16/bf16/f32| |
172 |
| -| ✔️ [block_reduce_f32_kernel](./reduce/block_all_reduce.cu)|f32|f32| |
173 |
| -| ✔️ [block_all_reduce_sum_f32_f32_kernel](./reduce/block_all_reduce.cu)|f32|f32| |
174 |
| -| ✔️ [block_all_reduce_sum_f32x4_f32_kernel](./reduce/block_all_reduce.cu)|f32|f32| |
175 |
| -| ✔️ [block_all_reduce_sum_f16_f16_kernel](./reduce/block_all_reduce.cu)|f16|f16| |
176 |
| -| ✔️ [block_all_reduce_sum_f16_f32_kernel](./reduce/block_all_reduce.cu)|f16|f32| |
177 |
| -| ✔️ [block_all_reduce_sum_f16x2_f16_kernel](./reduce/block_all_reduce.cu)|f16|f16| |
178 |
| -| ✔️ [block_all_reduce_sum_f16x2_f32_kernel](./reduce/block_all_reduce.cu)|f16|f32| |
179 |
| -| ✔️ [block_all_reduce_sum_bf16_bf16_kernel](./reduce/block_all_reduce.cu)|bf16|bf16| |
180 |
| -| ✔️ [block_all_reduce_sum_bf16_f32_kernel](./reduce/block_all_reduce.cu)|bf16|f32| |
181 |
| -| ✔️ [block_all_reduce_sum_bf16x2_bf16_kernel](./reduce/block_all_reduce.cu)|bf16|bf16| |
182 |
| -| ✔️ [block_all_reduce_sum_bf16x2_f32_kernel](./reduce/block_all_reduce.cu)|bf16|f32| |
183 |
| -| ✔️ [block_all_reduce_sum_fp8_e4m3_f16_kernel](./reduce/block_all_reduce.cu)|fp8_e4m3|f16| |
184 |
| -| ❔ [block_all_reduce_sum_i8_i32_kernel](./reduce/block_all_reduce.cu)|i8|i32| |
185 |
| -| ✔️ [dot_product_f32_kernel](./dot-product/dot_product.cu)|f32|f32| |
186 |
| -| ✔️ [dot_product_f32x4_kernel](./dot-product/dot_product.cu)|f32|f32| |
187 |
| -| ❔ [dot_product_f16_f16_kernel](./dot-product/dot_product.cu)|f16|f16| |
188 |
| -| ❔ [dot_product_f16x2_f16_kernel](./dot-product/dot_product.cu)|f16|f16| |
189 |
| -| ❔ [dot_product_f16_f32_kernel](./dot-product/dot_product.cu)|f16|f32| |
190 |
| -| ❔ [dot_product_f16x2_f32_kernel](./dot-product/dot_product.cu)|f16|f32| |
191 |
| -| ✔️ [elementwise_f32_kernel](./elementwise/elementwise.cu)|f32|-| |
192 |
| -| ✔️ [elementwise_f32x4_kernel](./elementwise/elementwise.cu)|f32|-| |
193 |
| -| ❔ [elementwise_f16_kernel](./elementwise/elementwise.cu)|f16|-| |
194 |
| -| ❔ [elementwise_f16x2_kernel](./elementwise/elementwise.cu)|f16|-| |
195 |
| -| ✔️ [histogram_i32_kernel](./histogram/histogram.cu)|i32|-| |
196 |
| -| ✔️ [histogram_i32x4_kernel](./histogram/histogram.cu)|i32|-| |
197 |
| -| ✔️ [softmax_f32_kernel (grid level memory fence)](./softmax/softmax.cu)|f32|f32| |
198 |
| -| ✔️ [softmax_f32x4_kernel (grid level memory fence)](./softmax/softmax.cu)|f32|f32| |
199 |
| -| ❔ [softmax_f32x4_kernel (per token)](./softmax/softmax.cu)|f32|f32| |
200 |
| -| ❔ [safe_softmax_f32x4_kernel (per token)](./softmax/softmax.cu)|f32|f32| |
201 |
| -| ✔️ [sigmoid_f32_kernel](./sigmoid/sigmoid.cu)|f32|-| |
202 |
| -| ✔️ [sigmoid_f32x4_kernel](./sigmoid/sigmoid.cu)|f32|-| |
203 |
| -| ✔️ [relu_f32_kernel](./relu/relu.cu)|f32|-| |
204 |
| -| ✔️ [relu_f32x4_kernel](./relu/relu.cu)|f32|-| |
205 |
| -| ❔ [relu_f16_kernel](./relu/relu.cu)|f16|-| |
206 |
| -| ❔ [relu_f16x2_kernel](./relu/relu.cu)|f16|-| |
207 |
| -| ✔️ [layer_norm_f32_kernel (per token)](./layer-norm/layer_norm.cu)|f32|f32| |
208 |
| -| ✔️ [layer_norm_f32x4_kernel (per token)](./layer-norm/layer_norm.cu)|f32|f32| |
209 |
| -| ❔ [layer_norm_f16_kernel (per token)](./layer-norm/layer_norm.cu)|f16|f16| |
210 |
| -| ❔ [layer_norm_f16x2_kernel (per token)](./layer-norm/layer_norm.cu)|f16|f16| |
211 |
| -| ✔️ [rms_norm_f32_kernel (per token)](./rms-norm/rms_norm.cu)|f32|f32| |
212 |
| -| ✔️ [rms_norm_f32x4_kernel (per token)](./rms-norm/rms_norm.cu)|f32|f32| |
213 |
| -| ❔ [rms_norm_f16_kernel (per token)](./rms-norm/rms_norm.cu)|f16|f16| |
214 |
| -| ❔ [rms_norm_f16x2_kernel (per token)](./rms-norm/rms_norm.cu)|f16|f16| |
215 |
| -| ✔️ [flash_attn_1_fwd_f32_kernel](./flash-attn/flash_attn_1_fwd_f32.cu)|f32|f32| |
216 |
| -| ❔ [flash_attn_2_fwd_f32_kernel](./flash-attn/flash_attn_2_fwd_f32.cu)|f32|f32| |
217 |
| -| ❔ [flash_attn_2_fwd_f16_kernel](./flash-attn/flash_attn_2_fwd_f32.cu)|f16|f32| |
218 |
| -| ❔ [flash_attn_2_fwd_bf16_kernel](./flash-attn/flash_attn_2_fwd_f32.cu)|bf16|f32| |
219 |
| -| ✔️ [hard_nms cpp only](./nms/nms.cc)|f32|-| |
220 |
| -| ✔️ [notes v1(deprecated)](./notes-v1.cu)|f32|f32| |
| 159 | +- **workflow**: custom **CUDA** kernel impl -> **Torch** python binding -> Run tests. |
| 160 | + |
| 161 | +|📖 cuda kernel| 📖 elem dtype| 📖 acc dtype| 📖 docs | |
| 162 | +|:---|:---|:---|:---| |
| 163 | +| ✔️ [sgemm_sliced_k_f32_kernel](./sgemm/sgemm.cu)|f32|f32|❔| |
| 164 | +| ✔️ [sgemm_t_tile_sliced_k_f32x4_kernel](./sgemm/sgemm.cu)|f32|f32|❔| |
| 165 | +| ❔ [hgemm_sliced_k_f16_f32_kernel](./sgemm/sgemm.cu)|f16|f32|❔| |
| 166 | +| ❔ [hgemm_t_tile_sliced_k_f16x2_f32_kernel](./sgemm/sgemm.cu)|f16|f32|❔| |
| 167 | +| ✔️ [sgemv_k32_f32_kernel](./sgemv/sgemv.cu)|f32|f32|❔| |
| 168 | +| ✔️ [sgemv_k128_f32x4_kernel](./sgemv/sgemv.cu)|f32|f32|❔| |
| 169 | +| ✔️ [sgemv_k16_f32_kernel](./sgemv/sgemv.cu)|f32|f32|❔| |
| 170 | +| ❔ [hgemv_k32_f16_kernel](./sgemv/sgemv.cu)|f16|f16|❔| |
| 171 | +| ❔ [hgemv_k128_f16x2_kernel](./sgemv/sgemv.cu)|f16|f16|❔| |
| 172 | +| ❔ [hgemv_k16_f16_kernel](./sgemv/sgemv.cu)|f16|f16|❔| |
| 173 | +| ✔️ [warp_reduce_f32/f16/bf16_kernel](./reduce/block_all_reduce.cu)|f16/bf16/f32|f16/bf16/f32|[link](./reduce/)| |
| 174 | +| ✔️ [block_reduce_f32_kernel](./reduce/block_all_reduce.cu)|f32|f32|[link](./reduce/)| |
| 175 | +| ✔️ [block_all_reduce_sum_f32_f32_kernel](./reduce/block_all_reduce.cu)|f32|f32|[link](./reduce/)| |
| 176 | +| ✔️ [block_all_reduce_sum_f32x4_f32_kernel](./reduce/block_all_reduce.cu)|f32|f32|[link](./reduce/)| |
| 177 | +| ✔️ [block_all_reduce_sum_f16_f16_kernel](./reduce/block_all_reduce.cu)|f16|f16|[link](./reduce/)| |
| 178 | +| ✔️ [block_all_reduce_sum_f16_f32_kernel](./reduce/block_all_reduce.cu)|f16|f32|[link](./reduce/)| |
| 179 | +| ✔️ [block_all_reduce_sum_f16x2_f16_kernel](./reduce/block_all_reduce.cu)|f16|f16|[link](./reduce/)| |
| 180 | +| ✔️ [block_all_reduce_sum_f16x2_f32_kernel](./reduce/block_all_reduce.cu)|f16|f32|[link](./reduce/)| |
| 181 | +| ✔️ [block_all_reduce_sum_bf16_bf16_kernel](./reduce/block_all_reduce.cu)|bf16|bf16|[link](./reduce/)| |
| 182 | +| ✔️ [block_all_reduce_sum_bf16_f32_kernel](./reduce/block_all_reduce.cu)|bf16|f32|[link](./reduce/)| |
| 183 | +| ✔️ [block_all_reduce_sum_bf16x2_bf16_kernel](./reduce/block_all_reduce.cu)|bf16|bf16|[link](./reduce/)| |
| 184 | +| ✔️ [block_all_reduce_sum_bf16x2_f32_kernel](./reduce/block_all_reduce.cu)|bf16|f32|[link](./reduce/)| |
| 185 | +| ✔️ [block_all_reduce_sum_fp8_e4m3_f16_kernel](./reduce/block_all_reduce.cu)|fp8_e4m3|f16|[link](./reduce/)| |
| 186 | +| ❔ [block_all_reduce_sum_i8_i32_kernel](./reduce/block_all_reduce.cu)|i8|i32|[link](./reduce/)| |
| 187 | +| ✔️ [dot_product_f32_kernel](./dot-product/dot_product.cu)|f32|f32|❔| |
| 188 | +| ✔️ [dot_product_f32x4_kernel](./dot-product/dot_product.cu)|f32|f32|❔| |
| 189 | +| ❔ [dot_product_f16_f16_kernel](./dot-product/dot_product.cu)|f16|f16|❔| |
| 190 | +| ❔ [dot_product_f16x2_f16_kernel](./dot-product/dot_product.cu)|f16|f16|❔| |
| 191 | +| ❔ [dot_product_f16_f32_kernel](./dot-product/dot_product.cu)|f16|f32|❔| |
| 192 | +| ❔ [dot_product_f16x2_f32_kernel](./dot-product/dot_product.cu)|f16|f32|❔| |
| 193 | +| ✔️ [elementwise_f32_kernel](./elementwise/elementwise.cu)|f32|/|❔| |
| 194 | +| ✔️ [elementwise_f32x4_kernel](./elementwise/elementwise.cu)|f32|/|❔| |
| 195 | +| ❔ [elementwise_f16_kernel](./elementwise/elementwise.cu)|f16|/|❔| |
| 196 | +| ❔ [elementwise_f16x2_kernel](./elementwise/elementwise.cu)|f16|/|❔| |
| 197 | +| ✔️ [histogram_i32_kernel](./histogram/histogram.cu)|i32|/|❔| |
| 198 | +| ✔️ [histogram_i32x4_kernel](./histogram/histogram.cu)|i32|/|❔| |
| 199 | +| ✔️ [softmax_f32_kernel (grid level memory fence)](./softmax/softmax.cu)|f32|f32|❔| |
| 200 | +| ✔️ [softmax_f32x4_kernel (grid level memory fence)](./softmax/softmax.cu)|f32|f32|❔| |
| 201 | +| ❔ [softmax_f32x4_kernel (per token)](./softmax/softmax.cu)|f32|f32|❔| |
| 202 | +| ❔ [safe_softmax_f32x4_kernel (per token)](./softmax/softmax.cu)|f32|f32|❔| |
| 203 | +| ✔️ [sigmoid_f32_kernel](./sigmoid/sigmoid.cu)|f32|/|❔| |
| 204 | +| ✔️ [sigmoid_f32x4_kernel](./sigmoid/sigmoid.cu)|f32|/|❔| |
| 205 | +| ✔️ [relu_f32_kernel](./relu/relu.cu)|f32|/|❔| |
| 206 | +| ✔️ [relu_f32x4_kernel](./relu/relu.cu)|f32|/|❔| |
| 207 | +| ❔ [relu_f16_kernel](./relu/relu.cu)|f16|/|❔| |
| 208 | +| ❔ [relu_f16x2_kernel](./relu/relu.cu)|f16|/|❔| |
| 209 | +| ✔️ [layer_norm_f32_kernel (per token)](./layer-norm/layer_norm.cu)|f32|f32|❔| |
| 210 | +| ✔️ [layer_norm_f32x4_kernel (per token)](./layer-norm/layer_norm.cu)|f32|f32|❔| |
| 211 | +| ❔ [layer_norm_f16_kernel (per token)](./layer-norm/layer_norm.cu)|f16|f16|❔| |
| 212 | +| ❔ [layer_norm_f16x2_kernel (per token)](./layer-norm/layer_norm.cu)|f16|f16|❔| |
| 213 | +| ✔️ [rms_norm_f32_kernel (per token)](./rms-norm/rms_norm.cu)|f32|f32|❔| |
| 214 | +| ✔️ [rms_norm_f32x4_kernel (per token)](./rms-norm/rms_norm.cu)|f32|f32|❔| |
| 215 | +| ❔ [rms_norm_f16_kernel (per token)](./rms-norm/rms_norm.cu)|f16|f16|❔| |
| 216 | +| ❔ [rms_norm_f16x2_kernel (per token)](./rms-norm/rms_norm.cu)|f16|f16|❔| |
| 217 | +| ✔️ [flash_attn_1_fwd_f32_kernel](./flash-attn/flash_attn_1_fwd_f32.cu)|f32|f32|[link](./flash-attn)| |
| 218 | +| ❔ [flash_attn_2_fwd_f32_kernel](./flash-attn/flash_attn_2_fwd_f32.cu)|f32|f32|[link](./flash-attn)| |
| 219 | +| ❔ [flash_attn_2_fwd_f16_kernel](./flash-attn/flash_attn_2_fwd_f32.cu)|f16|f32|[link](./flash-attn)| |
| 220 | +| ❔ [flash_attn_2_fwd_bf16_kernel](./flash-attn/flash_attn_2_fwd_f32.cu)|bf16|f32|[link](./flash-attn)| |
| 221 | +| ✔️ [hard_nms cpp only](./nms/nms.cc)|f32|/|❔| |
| 222 | +| ✔️ [notes v1(deprecated)](./notes-v1.cu)|f32|f32|/| |
221 | 223 |
|
222 | 224 | ## ©️License
|
223 | 225 | GNU General Public License v3.0
|
|
0 commit comments