@@ -49,6 +49,16 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
   }
 }

+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [0, 1]}>
+#mma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [1, 1], instrShape = [16, 16], isTransposed = true}>
+#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}>
+#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}>
+#dotOp0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>
+#dotOp1 = #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>
+
 // Should apply: tile size 256x256x64 with single dot
 // CHECK-LABEL: sink_2nd_load_256x256x64
 // CHECK: %[[tileA:.*]] = tt.load
@@ -78,6 +88,16 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
   }
 }

+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [0, 1]}>
+#mma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [1, 1], instrShape = [16, 16], isTransposed = true}>
+#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}>
+#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}>
+#dotOp0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>
+#dotOp1 = #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>
+
 // Should NOT apply: tile size 256x64x128 with single dot
 // CHECK-LABEL: sink_2nd_load_256x64x128
 // CHECK: %[[tileA:.*]] = tt.load
@@ -107,6 +127,16 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
   }
 }

+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [0, 1]}>
+#mma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [1, 1], instrShape = [16, 16], isTransposed = true}>
+#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}>
+#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}>
+#dotOp0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>
+#dotOp1 = #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>
+
 // Should NOT apply: tile size 256x256x32 with single dot
 // CHECK-LABEL: sink_2nd_load_256x256x32
 // CHECK: %[[tileA:.*]] = tt.load
@@ -136,6 +166,15 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
   }
 }

+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [0, 1]}>
+#mma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [1, 1], instrShape = [16, 16], isTransposed = true}>
+#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}>
+#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}>
+#dotOp0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>
+#dotOp1 = #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>

 // Category 2: single dot with two loads and tile size is large enough (128x128x128).
 // We make sure the move is legal.