@@ -31,7 +31,6 @@ using namespace nvcuda;
31
31
// Support row-major A and B matrices in order to compare with the kernels using CUDA Cores in
32
32
// hgemm.cu and hgemm_async.cu.
33
33
34
-
35
34
HOST_DEVICE_INLINE
36
35
int div_ceil(int a, int b) {
  // Returns the ceiling of a / b, i.e. the smallest integer >= the exact
  // quotient. b must be non-zero (division by zero is undefined).
  const int quot = a / b;
  const int rem = a % b;
  // C++ integer division truncates toward zero, so the truncated quotient is
  // already the ceiling when the exact quotient is negative. Round up only
  // when the division is inexact and the exact quotient is positive, which is
  // the case exactly when the remainder has the same sign as the divisor.
  return (rem != 0 && ((rem < 0) == (b < 0))) ? (quot + 1) : quot;
}
37
36
@@ -41,7 +40,7 @@ int div_ceil(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }
41
40
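// shared memory; when launching the kernel the dynamic shared memory size must be specified, and smem
42
41
// must be addressed as a one-dimensional array. 2. Improve L2 cache locality (Thread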
43
42
// Block Swizzle): https://zhuanlan.zhihu.com/p/555339335
44
- // 3. nedd __launch_bounds__ to avoid error 'too many resources required for launch'
43
+ // 3. __launch_bounds__: avoid error 'too many resources required for launch'
45
44
// reference: https://blog.csdn.net/feng__shuai/article/details/124395023
46
45
template <const int WMMA_M=16 , const int WMMA_N=16 , const int WMMA_K=16 ,
47
46
const int WMMA_TILE_M=4 , const int WMMA_TILE_N=2 ,
@@ -257,7 +256,7 @@ hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_kernel(
257
256
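// shared memory; when launching the kernel the dynamic shared memory size must be specified, and smem
258
257
// must be addressed as a one-dimensional array. 2. Improve L2 cache locality (Thread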
259
258
// Block Swizzle): https://zhuanlan.zhihu.com/p/555339335
260
- // 3. nedd __launch_bounds__ to avoid error 'too many resources required for launch'
259
+ // 3. __launch_bounds__: avoid error 'too many resources required for launch'
261
260
// reference: https://blog.csdn.net/feng__shuai/article/details/124395023
262
261
template <const int WMMA_M=16 , const int WMMA_N=16 , const int WMMA_K=16 ,
263
262
const int WMMA_TILE_M=4 , const int WMMA_TILE_N=2 ,
@@ -476,7 +475,7 @@ hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem_kernel(
476
475
}
477
476
478
477
// stage with 256x256 block, dynamic smem
479
- // nedd __launch_bounds__ to avoid error 'too many resources required for launch'
478
+ // __launch_bounds__: avoid error 'too many resources required for launch'
480
479
// reference: https://blog.csdn.net/feng__shuai/article/details/124395023
481
480
template <const int WMMA_M=16 , const int WMMA_N=16 , const int WMMA_K=16 ,
482
481
const int WMMA_TILE_M=4 , const int WMMA_TILE_N=4 ,
0 commit comments