@@ -146,49 +146,49 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
146146 // CHECK: [[BLOCKED_LAYOUT2:#.*]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [4, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
147147 // CHECK: @test_block_ptrs
148148 tt.func public @test_block_ptrs (%arg0: !tt.ptr <f8E5M2 > {tt.divisibility = 16 : i32 }, %arg1: !tt.ptr <f8E5M2 > {tt.divisibility = 16 : i32 }, %arg2: !tt.ptr <f8E5M2 > {tt.divisibility = 16 : i32 }, %arg3: f32 , %arg4: !tt.ptr <f32 > {tt.divisibility = 16 : i32 }, %arg5: !tt.ptr <f8E5M2 > {tt.divisibility = 16 : i32 }, %arg6: i32 {tt.divisibility = 16 : i32 }, %arg7: i32 {tt.divisibility = 16 : i32 }, %arg8: i32 {tt.divisibility = 16 : i32 }, %arg9: i32 , %arg10: i32 , %arg11: i32 {tt.divisibility = 16 : i32 }, %arg12: i32 , %arg13: i32 , %arg14: i32 , %arg15: i32 , %arg16: i32 , %arg17: i32 {tt.divisibility = 16 : i32 }, %arg18: i32 , %arg19: i32 , %arg20: i32 ) {
149- %cst = arith.constant dense <0.000000e+00 > : tensor <8 x16 xf32 , #dpas >
150- %cst_0 = arith.constant dense <0.000000e+00 > : tensor <8 xf32 , #blocked >
151- %cst_1 = arith.constant dense <0xFF800000 > : tensor <8 xf32 , #blocked >
152- %c1_i32 = arith.constant 1 : i32
153- %c16_i32 = arith.constant 16 : i32
154- %cst_2 = arith.constant dense <0.000000e+00 > : tensor <8 x64 xf32 , #blocked1 >
155- %c0_i32 = arith.constant 0 : i32
156- %c1_i64 = arith.constant 1 : i64
157- %c64_i64 = arith.constant 64 : i64
158- %c8_i32 = arith.constant 8 : i32
159- %0 = tt.get_program_id x : i32
160- %1 = tt.get_program_id y : i32
161- %2 = arith.divsi %1 , %arg19 : i32
162- %3 = arith.remsi %1 , %arg19 : i32
163- %4 = arith.extsi %2 : i32 to i64
164- %5 = arith.extsi %arg6 : i32 to i64
165- %6 = arith.muli %4 , %5 : i64
166- %7 = arith.extsi %3 : i32 to i64
167- %8 = arith.extsi %arg7 : i32 to i64
168- %9 = arith.muli %7 , %8 : i64
169- %10 = arith.addi %6 , %9 : i64
170- %11 = tt.addptr %arg0 , %10 : !tt.ptr <f8E5M2 >, i64
171- %12 = arith.muli %0 , %c8_i32 : i32
172- %13 = arith.extsi %arg20 : i32 to i64
173- %14 = arith.extsi %arg8 : i32 to i64
149+ %cst = arith.constant dense <0.000000e+00 > : tensor <8 x16 xf32 , #dpas >
150+ %cst_0 = arith.constant dense <0.000000e+00 > : tensor <8 xf32 , #blocked >
151+ %cst_1 = arith.constant dense <0xFF800000 > : tensor <8 xf32 , #blocked >
152+ %c1_i32 = arith.constant 1 : i32
153+ %c16_i32 = arith.constant 16 : i32
154+ %cst_2 = arith.constant dense <0.000000e+00 > : tensor <8 x64 xf32 , #blocked1 >
155+ %c0_i32 = arith.constant 0 : i32
156+ %c1_i64 = arith.constant 1 : i64
157+ %c64_i64 = arith.constant 64 : i64
158+ %c8_i32 = arith.constant 8 : i32
159+ %0 = tt.get_program_id x : i32
160+ %1 = tt.get_program_id y : i32
161+ %2 = arith.divsi %1 , %arg19 : i32
162+ %3 = arith.remsi %1 , %arg19 : i32
163+ %4 = arith.extsi %2 : i32 to i64
164+ %5 = arith.extsi %arg6 : i32 to i64
165+ %6 = arith.muli %4 , %5 : i64
166+ %7 = arith.extsi %3 : i32 to i64
167+ %8 = arith.extsi %arg7 : i32 to i64
168+ %9 = arith.muli %7 , %8 : i64
169+ %10 = arith.addi %6 , %9 : i64
170+ %11 = tt.addptr %arg0 , %10 : !tt.ptr <f8E5M2 >, i64
171+ %12 = arith.muli %0 , %c8_i32 : i32
172+ %13 = arith.extsi %arg20 : i32 to i64
173+ %14 = arith.extsi %arg8 : i32 to i64
174174 // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}} : <tensor<8x64xf8E5M2, [[BLOCKED_LAYOUT1]]>
175175 %15 = tt.make_tensor_ptr %11 , [%13 , %c64_i64 ], [%14 , %c1_i64 ], [%12 , %c0_i32 ] {order = array<i32 : 1 , 0 >} : <tensor <8 x64 xf8 E5 M2 , #dot1 >>
176- %16 = tt.addptr %arg1 , %10 : !tt.ptr <f8E5M2 >, i64
177- %17 = arith.extsi %arg11 : i32 to i64
176+ %16 = tt.addptr %arg1 , %10 : !tt.ptr <f8E5M2 >, i64
177+ %17 = arith.extsi %arg11 : i32 to i64
178178 // CHECK: [[PTR2:%.*]] = tt.make_tensor_ptr {{.*}} : <tensor<64x16xf8E5M2, [[BLOCKED_LAYOUT2]]>
179179 %18 = tt.make_tensor_ptr %16 , [%c64_i64 , %13 ], [%c1_i64 , %17 ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 0 , 1 >} : <tensor <64 x16 xf8 E5 M2 , #dot2 >>
180- %19 = tt.addptr %arg5 , %10 : !tt.ptr <f8E5M2 >, i64
181- %20 = arith.extsi %arg17 : i32 to i64
180+ %19 = tt.addptr %arg5 , %10 : !tt.ptr <f8E5M2 >, i64
181+ %20 = arith.extsi %arg17 : i32 to i64
182182 // CHECK: [[PTR3:%.*]] = tt.make_tensor_ptr {{.*}} : <tensor<8x64xf8E5M2, [[BLOCKED_LAYOUT1]]>
183183 %21 = tt.make_tensor_ptr %19 , [%13 , %c64_i64 ], [%20 , %c1_i64 ], [%12 , %c0_i32 ] {order = array<i32 : 1 , 0 >} : <tensor <8 x64 xf8 E5 M2 , #blocked1 >>
184184 %22 = tt.make_range {end = 8 : i32 , start = 0 : i32 } : tensor <8 xi32 , #blocked >
185- %23 = tt.splat %12 : i32 -> tensor <8 xi32 , #blocked >
186- %24 = arith.addi %23 , %22 : tensor <8 xi32 , #blocked >
185+ %23 = tt.splat %12 : i32 -> tensor <8 xi32 , #blocked >
186+ %24 = arith.addi %23 , %22 : tensor <8 xi32 , #blocked >
187187 // CHECK: [[LOAD1:%.*]] = tt.load [[PTR1]] : !tt.ptr<tensor<8x64xf8E5M2, [[BLOCKED_LAYOUT1]]>
188188 // CHECK-NEXT: triton_gpu.convert_layout [[LOAD1]] : tensor<8x64xf8E5M2, [[BLOCKED_LAYOUT1]]> -> tensor<8x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
189189 %25 = tt.load %15 : !tt.ptr <tensor <8 x64 xf8 E5 M2 , #dot1 >>
190- %26 = arith.addi %0 , %c1_i32 : i32
191- %27 = arith.muli %26 , %c8_i32 : i32
190+ %26 = arith.addi %0 , %c1_i32 : i32
191+ %27 = arith.muli %26 , %c8_i32 : i32
192192 // CHECK: [[ADVANCE1:%.*]] = tt.advance [[PTR2]], {{.*}} : <tensor<64x16xf8E5M2, [[BLOCKED_LAYOUT2]]>>
193193 %28 = tt.advance %18 , [%c0_i32 , %12 ] : <tensor <64 x16 xf8 E5 M2 , #dot2 >>
194194 // CHECK: [[RES:%.*:2]] = scf.for {{.*}} iter_args(%arg22 = %cst_1, %arg23 = [[ADVANCE1]]) -> (tensor<8xf32, #blocked>, !tt.ptr<tensor<64x16xf8E5M2, [[BLOCKED_LAYOUT2]]>>)
@@ -202,8 +202,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
202202 %40 = triton_gpu.convert_layout %39 : tensor <8 x16 xf32 , #dpas > -> tensor <8 x16 xf32 , #blocked2 >
203203 %41 = " tt.reduce" (%40 ) <{axis = 1 : i32 }> ({
204204 ^bb0 (%arg24: f32 , %arg25: f32 ):
205- %44 = arith.maxnumf %arg24 , %arg25 : f32
206- tt.reduce.return %44 : f32
205+ %44 = arith.maxnumf %arg24 , %arg25 : f32
206+ tt.reduce.return %44 : f32
207207 }) : (tensor <8 x16 xf32 , #blocked2 >) -> tensor <8 xf32 , #triton_gpu.slice <{dim = 1 , parent = #blocked2 }>>
208208 %42 = triton_gpu.convert_layout %41 : tensor <8 xf32 , #triton_gpu.slice <{dim = 1 , parent = #blocked2 }>> -> tensor <8 xf32 , #blocked >
209209 // CHECK: [[ADVANCE2:%.*]] = tt.advance %arg23, {{.*}} : <tensor<64x16xf8E5M2, [[BLOCKED_LAYOUT2]]>>
@@ -219,7 +219,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
219219 tt.store %34 , %30 : tensor <8 x!tt.ptr <f32 >, #blocked >
220220 %35 = tt.fp_to_fp %cst_2 , rounding = rtne : tensor <8 x64 xf32 , #blocked1 > -> tensor <8 x64 xf8 E5 M2 , #blocked1 >
221221 tt.store %21 , %35 : !tt.ptr <tensor <8 x64 xf8 E5 M2 , #blocked1 >>
222- tt.return
222+ tt.return
223223 }
224224}
225225
@@ -254,19 +254,19 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
254254 %13 = arith.extsi %arg19 : i32 to i64
255255 %19 = tt.addptr %arg1 , %10 : !tt.ptr <f8E5M2 >, i64
256256 %20 = arith.extsi %arg11 : i32 to i64
257- // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}} : <tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>
257+ // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}} : <tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>
258258 %21 = tt.make_tensor_ptr %19 , [%c64_i64 , %13 ], [%c1_i64 , %20 ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 0 , 1 >} : <tensor <64 x32 xf8 E5 M2 , #dot2 >>
259259 // CHECK: [[RES:%.*]]:2 = scf.for {{.*}} iter_args(%arg6 = %cst, %arg7 = [[PTR1]]) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr<tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>)
260260 %33:2 = scf.for %arg21 = %c0_i32 to %12 step %c32_i32 iter_args (%arg22 = %cst_1 , %arg23 = %21 ) -> (tensor <64 xf32 , #triton_gpu.slice <{dim = 1 , parent = #dpas }>>, !tt.ptr <tensor <64 x32 xf8 E5 M2 , #dot2 >>) : i32 {
261261 // CHECK: [[LOAD:%.*]] = tt.load %arg7 : !tt.ptr<tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>
262- // CHECK-NEXT: triton_gpu.convert_layout [[LOAD]] : tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]> -> tensor<64x32xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
262+ // CHECK-NEXT: triton_gpu.convert_layout [[LOAD]] : tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]> -> tensor<64x32xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
263263 // CHECK-NEXT: scf.yield %arg6, %arg7 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr<tensor<64x32xf8E5M2, #blocked>>
264264 %load = tt.load %arg23 : !tt.ptr <tensor <64 x32 xf8 E5 M2 , #dot2 >>
265265 scf.yield %arg22 , %arg23 : tensor <64 xf32 , #triton_gpu.slice <{dim = 1 , parent = #dpas }>>, !tt.ptr <tensor <64 x32 xf8 E5 M2 , #dot2 >>
266266 }
267267 // CHECK: scf.for {{.*}} iter_args(%arg6 = [[RES]]#0, %arg7 = [[RES]]#1) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr<tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>)
268268 %34:2 = scf.for %arg21 = %c0_i32 to %12 step %c32_i32 iter_args (%arg22 = %33#0 , %arg23 = %33#1 ) -> (tensor <64 xf32 , #triton_gpu.slice <{dim = 1 , parent = #dpas }>>, !tt.ptr <tensor <64 x32 xf8 E5 M2 , #dot2 >>) : i32 {
269- // CHECK: scf.yield %arg6, %arg7 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr<tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>
269+ // CHECK: scf.yield %arg6, %arg7 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr<tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>
270270 scf.yield %arg22 , %arg23 : tensor <64 xf32 , #triton_gpu.slice <{dim = 1 , parent = #dpas }>>, !tt.ptr <tensor <64 x32 xf8 E5 M2 , #dot2 >>
271271 }
272272 tt.return
0 commit comments