@@ -18,13 +18,21 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
1818 %4 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%pitch , %c1_i64 ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 1 , 0 >} : <tensor <32 x64 xf16 , #dot_b >>
1919 %5 = tt.load %3 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
2020 %6 = tt.load %4 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
21+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "row_major"}
22+ tt.store %3 , %5 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
23+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "row_major"}
24+ tt.store %4 , %6 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
2125
2226 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
2327 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0>, padding = 1 : i32, ttig.block_io = "column_major"}
2428 %7 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%c1_i64 , %pitch ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 0 , 1 >} : <tensor <64 x32 xf16 , #dot_a >>
2529 %8 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%c1_i64 , %pitch ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 0 , 1 >} : <tensor <32 x64 xf16 , #dot_b >>
2630 %9 = tt.load %7 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
2731 %10 = tt.load %8 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
32+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
33+ tt.store %7 , %9 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
34+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "column_major"}
35+ tt.store %8 , %10 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
2836
2937 // COM: Non-constant stride on fast changing dim.
3038 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -33,6 +41,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
3341 %12 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%pitch , %pitch ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 0 , 1 >} : <tensor <32 x64 xf16 , #dot_b >>
3442 %13 = tt.load %11 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
3543 %14 = tt.load %12 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
44+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
45+ tt.store %11 , %13 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
46+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
47+ tt.store %12 , %14 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
3648
3749 // COM: Non-64 divisible pitch.
3850 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -41,6 +53,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
4153 %16 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%c1_i64 , %pitch_odd ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 0 , 1 >} : <tensor <32 x64 xf16 , #dot_b >>
4254 %17 = tt.load %15 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
4355 %18 = tt.load %16 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
56+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
57+ tt.store %15 , %17 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
58+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
59+ tt.store %16 , %18 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
4460
4561 // COM: Non-4-byte-aligned base.
4662 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -49,6 +65,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
4965 %20 = tt.make_tensor_ptr %arg1 , [%c0_i64 , %c0_i64 ], [%pitch , %c1_i64 ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 1 , 0 >} : <tensor <32 x64 xf16 , #dot_b >>
5066 %21 = tt.load %19 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
5167 %22 = tt.load %20 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
68+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
69+ tt.store %19 , %21 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
70+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
71+ tt.store %20 , %22 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
5272
5373 // COM: Non-4-byte-aligned baseWidth.
5474 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -57,6 +77,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
5777 %24 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c15_i64 ], [%pitch , %c1_i64 ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 1 , 0 >} : <tensor <32 x64 xf16 , #dot_b >>
5878 %25 = tt.load %23 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
5979 %26 = tt.load %24 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
80+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
81+ tt.store %23 , %25 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
82+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
83+ tt.store %24 , %26 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
6084
6185 // COM: Non-4-byte-aligned offsetX.
6286 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -65,6 +89,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
6589 %28 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%pitch , %c1_i64 ], [%c0_i32 , %c15_i32 ] {order = array<i32 : 1 , 0 >} : <tensor <32 x64 xf16 , #dot_b >>
6690 %29 = tt.load %27 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
6791 %30 = tt.load %28 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
92+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
93+ tt.store %27 , %29 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
94+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
95+ tt.store %28 , %30 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
96+
6897 tt.return
6998 }
7099}
@@ -103,6 +132,8 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
103132 // COM: 4-byte-aligned base (value obtained from addptr, addi, muli), baseWidth and offsetX (values obtained from muli).
104133 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0>, padding = 1 : i32, ttig.block_io = "row_major"}
105134 %11 = tt.load %10 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <128 x64 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #mma , kWidth = 1 }>>>
135+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "row_major"}
136+ tt.store %10 , %11 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <128 x64 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #mma , kWidth = 1 }>>>
106137 tt.return
107138 }
108139}
@@ -130,6 +161,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
130161 %14 = tt.advance %12 , [%1 , %arg3 ] : <tensor <8 x128 xf32 , #blocked >>
131162 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"}
132163 %15 = tt.load %14 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <8 x128 xf32 , #blocked >>
164+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "row_major"}
165+ tt.store %14 , %15 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <8 x128 xf32 , #blocked >>
133166 scf.yield %12 : !tt.ptr <tensor <8 x128 xf32 , #blocked >>
134167 }
135168 tt.return
0 commit comments