Skip to content

Commit 8c6922b

Browse files
authored
[SGEMM] Update SGEMM TF32 Benchmark (#87)
* Update README.md * Update hgemm_wmma_stage.cu * Update README.md * Update README.md * Update sgemm.py
1 parent c4db4f8 commit 8c6922b

File tree

3 files changed

+491
-339
lines changed

3 files changed

+491
-339
lines changed

hgemm/hgemm_wmma_stage.cu

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ using namespace nvcuda;
3131
// Support row-major A and B matrices in order to compare with the kernels using CUDA Cores in
3232
// hgemm.cu and hgemm_async.cu.
3333

34-
3534
HOST_DEVICE_INLINE
3635
int div_ceil(int a, int b) {
  // Integer quotient of a by b, bumped by one whenever b does not divide a
  // evenly (i.e. a % b is non-zero).
  const int quotient = a / b;
  const bool has_remainder = (a % b) != 0;
  return has_remainder ? (quotient + 1) : quotient;
}
3736

@@ -41,7 +40,7 @@ int div_ceil(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }
4140
// 共享内存,调用kernel时 需要指定动态共享内存大小,且smem的寻址
4241
// 方式需要按照一维数组来使用 2. 提高L2 Cache的局部性(Thread
4342
// Block Swizzle): https://zhuanlan.zhihu.com/p/555339335
44-
// 3. nedd __launch_bounds__ to avoid error 'too many resources required for launch'
43+
// 3. __launch_bounds__: avoid error 'too many resources required for launch'
4544
// reference: https://blog.csdn.net/feng__shuai/article/details/124395023
4645
template<const int WMMA_M=16, const int WMMA_N=16, const int WMMA_K=16,
4746
const int WMMA_TILE_M=4, const int WMMA_TILE_N=2,
@@ -257,7 +256,7 @@ hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_kernel(
257256
// 共享内存,调用kernel时 需要指定动态共享内存大小,且smem的寻址
258257
// 方式需要按照一维数组来使用 2. 提高L2 Cache的局部性(Thread
259258
// Block Swizzle): https://zhuanlan.zhihu.com/p/555339335
260-
// 3. nedd __launch_bounds__ to avoid error 'too many resources required for launch'
259+
// 3. __launch_bounds__: avoid error 'too many resources required for launch'
261260
// reference: https://blog.csdn.net/feng__shuai/article/details/124395023
262261
template<const int WMMA_M=16, const int WMMA_N=16, const int WMMA_K=16,
263262
const int WMMA_TILE_M=4, const int WMMA_TILE_N=2,
@@ -476,7 +475,7 @@ hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem_kernel(
476475
}
477476

478477
// stage with 256x256 block, dynamic smem
479-
// nedd __launch_bounds__ to avoid error 'too many resources required for launch'
478+
// __launch_bounds__: avoid error 'too many resources required for launch'
480479
// reference: https://blog.csdn.net/feng__shuai/article/details/124395023
481480
template<const int WMMA_M=16, const int WMMA_N=16, const int WMMA_K=16,
482481
const int WMMA_TILE_M=4, const int WMMA_TILE_N=4,

0 commit comments

Comments
 (0)