@@ -14,7 +14,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
1414 %cst = arith.constant dense <0.000000e+00 > : tensor <128 x128 xf32 , #blocked >
1515 %cst0 = arith.constant dense <0.000000e+00 > : tensor <128 x128 xf16 , #blocked >
1616 %cst1 = arith.constant dense <0.000000e+00 > : tensor <64 x64 xf16 , #blocked >
17- %cst2 = arith.constant dense <0.000000e+00 > : tensor <64 x 256 x f16 , #blocked >
17+ %cst2 = arith.constant dense <0.000000e+00 > : tensor <64 x 128 x f16 , #blocked >
1818 %cst3 = arith.constant dense <0 > : tensor <64 x4 xi8 , #linear >
1919 %cst4 = arith.constant dense <0.000000e+00 > : tensor <64 x128 xf16 , #blocked >
2020
@@ -39,8 +39,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
3939 // CHECK: ttng.tmem_alloc %{{.+}} {tensor_memory_col_offset = 128 : i32, tensor_memory_row_offset = 0 : i32}
4040 %6 = ttng.tmem_alloc %cst : (tensor <128 x128 xf32 , #blocked >) -> !ttg.memdesc <128 x128 xf32 , #tmem , #ttng.tensor_memory , mutable >
4141
42- ttng.tmem_store %cst2 , %4 , %true : tensor <64 x 256 x f16 , #blocked > -> !ttg.memdesc <64 x128 xf16 , #tmem2 , #ttng.tensor_memory , mutable >
43- ttng.tmem_store %cst2 , %5 , %true : tensor <64 x 256 x f16 , #blocked > -> !ttg.memdesc <64 x128 xf16 , #tmem2 , #ttng.tensor_memory , mutable >
42+ ttng.tmem_store %cst2 , %4 , %true : tensor <64 x 128 x f16 , #blocked > -> !ttg.memdesc <64 x128 xf16 , #tmem2 , #ttng.tensor_memory , mutable >
43+ ttng.tmem_store %cst2 , %5 , %true : tensor <64 x 128 x f16 , #blocked > -> !ttg.memdesc <64 x128 xf16 , #tmem2 , #ttng.tensor_memory , mutable >
4444 ttng.tmem_store %cst , %6 , %true : tensor <128 x128 xf32 , #blocked > -> !ttg.memdesc <128 x128 xf32 , #tmem , #ttng.tensor_memory , mutable >
4545
4646 %7 = ttng.tmem_alloc : () -> !ttg.memdesc <64 x4 xi8 , #tmem_scales , #ttng.tensor_memory , mutable >
0 commit comments