33## 📚 build bin
44
55``` bash
6- make
6+ make # build all default binaries
77```
88
99## 📚 ncu profile
1010
11- Achieve 0 bank conflicts for LDSM via smem swizzle.
11+ - 📚 Achieve 0 bank conflicts for LDSM via smem swizzle.
1212
1313``` bash
1414ncu --metrics l1tex__data_bank_reads ./mat_trans_swizzle.bin
@@ -20,7 +20,7 @@ ncu --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld ./hgemm_mma_s
2020ncu --metrics sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm ./hgemm_mma_swizzle.bin 1024 1024 1024 0 1
2121```
2222
23- log: (achieve 0 bank conflicts for LDSM via smem swizzle)
23+ - 📚 log: (achieve 0 bank conflicts for LDSM via smem swizzle)
2424
2525``` bash
2626ncu --metrics sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm ./hgemm_mma_swizzle.bin 1024 1024 1024 0 1
@@ -72,16 +72,10 @@ ncu --metrics sm__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldsm ./h
7272
7373## 📚 performance
7474
75- - NVIDIA TRX 3080 Laptop
75+ - 📚 NVIDIA RTX 3080 Laptop
7676``` bash
7777 ./hgemm_mma_swizzle.bin 4096 4096 4096 1 10
7878
79- ALGO = HGEMM MMA NAIVE
80- M N K = 4096 4096 4096, W = 1, R = 10, Time = 0.02986609 s, AVG Performance = 4.6018 Tflops
81-
82- ALGO = HGEMM MMA NAIVE + SMEM SWIZZLE
83- M N K = 4096 4096 4096, W = 1, R = 10, Time = 0.02860964 s, AVG Performance = 4.8039 Tflops
84-
8579ALGO = HGEMM mma2x4_warp4x4
8680M N K = 4096 4096 4096, W = 1, R = 10, Time = 0.00392888 s, AVG Performance = 34.9817 Tflops
8781
@@ -92,7 +86,7 @@ M N K = 4096 4096 4096, W = 1, R = 10, Time = 0.00234496 s, AVG Performa
9286
9387## 📚 print swizzle layout
9488
95- - M16K16
89+ - 📚 M16K16
9690
9791``` bash
9892python3 print_swizzle_layout.py --logical-col 64 --show-logical-col
@@ -147,7 +141,7 @@ smem col 0~16, step 8-
147141----------------------
148142```
149143
150- - M16K64
144+ - 📚 M16K64 (Zigzag)
151145
152146``` bash
153147python3 print_swizzle_layout.py --logical-col 64 --show-logical-col
0 commit comments