@@ -18,13 +18,21 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
18
18
%4 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%pitch , %c1_i64 ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 1 , 0 >} : <tensor <32 x64 xf16 , #dot_b >>
19
19
%5 = tt.load %3 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
20
20
%6 = tt.load %4 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
21
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "row_major"}
22
+ tt.store %3 , %5 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
23
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "row_major"}
24
+ tt.store %4 , %6 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
21
25
22
26
// CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
23
27
// CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0>, padding = 1 : i32, ttig.block_io = "column_major"}
24
28
%7 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%c1_i64 , %pitch ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 0 , 1 >} : <tensor <64 x32 xf16 , #dot_a >>
25
29
%8 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%c1_i64 , %pitch ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 0 , 1 >} : <tensor <32 x64 xf16 , #dot_b >>
26
30
%9 = tt.load %7 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
27
31
%10 = tt.load %8 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
32
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
33
+ tt.store %7 , %9 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
34
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "column_major"}
35
+ tt.store %8 , %10 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
28
36
29
37
// COM: Non-constant stride on the fastest-changing dimension.
30
38
// CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -33,6 +41,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
33
41
%12 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%pitch , %pitch ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 0 , 1 >} : <tensor <32 x64 xf16 , #dot_b >>
34
42
%13 = tt.load %11 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
35
43
%14 = tt.load %12 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
44
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
45
+ tt.store %11 , %13 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
46
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
47
+ tt.store %12 , %14 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
36
48
37
49
// COM: Non-64 divisible pitch.
38
50
// CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -41,6 +53,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
41
53
%16 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%c1_i64 , %pitch_odd ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 0 , 1 >} : <tensor <32 x64 xf16 , #dot_b >>
42
54
%17 = tt.load %15 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
43
55
%18 = tt.load %16 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
56
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
57
+ tt.store %15 , %17 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
58
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
59
+ tt.store %16 , %18 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
44
60
45
61
// COM: Non-4-byte-aligned base.
46
62
// CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -49,6 +65,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
49
65
%20 = tt.make_tensor_ptr %arg1 , [%c0_i64 , %c0_i64 ], [%pitch , %c1_i64 ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 1 , 0 >} : <tensor <32 x64 xf16 , #dot_b >>
50
66
%21 = tt.load %19 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
51
67
%22 = tt.load %20 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
68
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
69
+ tt.store %19 , %21 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
70
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
71
+ tt.store %20 , %22 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
52
72
53
73
// COM: Non-4-byte-aligned baseWidth.
54
74
// CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -57,6 +77,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
57
77
%24 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c15_i64 ], [%pitch , %c1_i64 ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 1 , 0 >} : <tensor <32 x64 xf16 , #dot_b >>
58
78
%25 = tt.load %23 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
59
79
%26 = tt.load %24 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
80
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
81
+ tt.store %23 , %25 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
82
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
83
+ tt.store %24 , %26 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
60
84
61
85
// COM: Non-4-byte-aligned offsetX.
62
86
// CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -65,6 +89,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
65
89
%28 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%pitch , %c1_i64 ], [%c0_i32 , %c15_i32 ] {order = array<i32 : 1 , 0 >} : <tensor <32 x64 xf16 , #dot_b >>
66
90
%29 = tt.load %27 {boundaryCheck = array<i32 : 1 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
67
91
%30 = tt.load %28 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
92
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
93
+ tt.store %27 , %29 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <64 x32 xf16 , #dot_a >>
94
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
95
+ tt.store %28 , %30 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <32 x64 xf16 , #dot_b >>
96
+
68
97
tt.return
69
98
}
70
99
}
@@ -103,6 +132,8 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
103
132
// COM: 4-byte-aligned base (value obtained from addptr, addi, muli), baseWidth and offsetX (values obtained from muli).
104
133
// CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0>, padding = 1 : i32, ttig.block_io = "row_major"}
105
134
%11 = tt.load %10 {boundaryCheck = array<i32 : 0 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , padding = 1 : i32 } : !tt.ptr <tensor <128 x64 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #mma , kWidth = 1 }>>>
135
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "row_major"}
136
+ tt.store %10 , %11 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <128 x64 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #mma , kWidth = 1 }>>>
106
137
tt.return
107
138
}
108
139
}
@@ -130,6 +161,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
130
161
%14 = tt.advance %12 , [%1 , %arg3 ] : <tensor <8 x128 xf32 , #blocked >>
131
162
// CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"}
132
163
%15 = tt.load %14 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <8 x128 xf32 , #blocked >>
164
+ // CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "row_major"}
165
+ tt.store %14 , %15 {boundaryCheck = array<i32 : 1 >} : !tt.ptr <tensor <8 x128 xf32 , #blocked >>
133
166
scf.yield %12 : !tt.ptr <tensor <8 x128 xf32 , #blocked >>
134
167
}
135
168
tt.return
0 commit comments