@@ -179,9 +179,9 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
179179 print (" mD:\t " ); print (mD ); print (" \n " ); // mD: gmem_ptr[32b](GMEM_ADDR_D) o (512,1024):(1024,_1)
180180
181181 print (" gA:\t " ); print (gA ); print (" \n " ); // gA: gmem_ptr[16b](GMEM_ADDR_A + offset_for_mma_tile) o (_128,_64,4):(256,_1,_64)
182- print (" gB:\t " ); print (gB ); print (" \n " ); // gB: gmem_ptr[16b](GMEM_ADDR_B + offset_for_mma_tile) o (_256,_64,4):(_1, 256,16384 )
183- print (" gC:\t " ); print (gC ); print (" \n " ); // gC: gmem_ptr[32b](GMEM_ADDR_C + offset_for_mma_tile) o (_128,_256):(256 ,_1)
184- print (" gD:\t " ); print (gD ); print (" \n " ); // gD: gmem_ptr[32b](GMEM_ADDR_D + offset_for_mma_tile) o (_128,_256):(256 ,_1)
182+ print (" gB:\t " ); print (gB ); print (" \n " ); // gB: gmem_ptr[16b](GMEM_ADDR_B + offset_for_mma_tile) o (_256,_64,4):(256,_1,_64 )
183+ print (" gC:\t " ); print (gC ); print (" \n " ); // gC: gmem_ptr[32b](GMEM_ADDR_C + offset_for_mma_tile) o (_128,_256):(1024 ,_1)
184+ print (" gD:\t " ); print (gD ); print (" \n " ); // gD: gmem_ptr[32b](GMEM_ADDR_D + offset_for_mma_tile) o (_128,_256):(1024 ,_1)
185185 } __syncthreads ();
186186
187187 // The SMEM tensors
@@ -209,9 +209,9 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
209209
210210 if (thread0 ()) {
211211 print (" tCgA:\t " ); print (tCgA); print (" \n " ); // tCgA: gmem_ptr[16b](GMEM_ADDR_A + offset_for_mma_tile + offset_for_mma) o ((_128,_16),_1,_4,4):((256,_1),_0,_16,_64)
212- print (" tCgB:\t " ); print (tCgB); print (" \n " ); // tCgB: gmem_ptr[16b](GMEM_ADDR_B + offset_for_mma_tile + offset_for_mma) o ((_256,_16),_1,_4,4):((_1, 256),_0,4096,16384 )
213- print (" tCgC:\t " ); print (tCgC); print (" \n " ); // tCgC: gmem_ptr[32b](GMEM_ADDR_C + offset_for_mma_tile + offset_for_mma) o ((_128,_256),_1,_1):((256 ,_1),_0,_0)
214- print (" tCgD:\t " ); print (tCgD); print (" \n " ); // tCgD: gmem_ptr[32b](GMEM_ADDR_D + offset_for_mma_tile + offset_for_mma) o ((_128,_256),_1,_1):((256 ,_1),_0,_0)
212+ print (" tCgB:\t " ); print (tCgB); print (" \n " ); // tCgB: gmem_ptr[16b](GMEM_ADDR_B + offset_for_mma_tile + offset_for_mma) o ((_256,_16),_1,_4,4):((256,_1 ),_0,_16,_64 )
213+ print (" tCgC:\t " ); print (tCgC); print (" \n " ); // tCgC: gmem_ptr[32b](GMEM_ADDR_C + offset_for_mma_tile + offset_for_mma) o ((_128,_256),_1,_1):((1024 ,_1),_0,_0)
214+ print (" tCgD:\t " ); print (tCgD); print (" \n " ); // tCgD: gmem_ptr[32b](GMEM_ADDR_D + offset_for_mma_tile + offset_for_mma) o ((_128,_256),_1,_1):((1024 ,_1),_0,_0)
215215 } __syncthreads ();
216216
217217 // MMA Fragment Allocation
0 commit comments