@@ -125,10 +125,10 @@ module attributes {"triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-wa
     // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
     %22 = tt.make_tensor_ptr %arg1, [%16, %20], [%21, %c1_i64], [%c0_i32, %19] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #dot1>>
     %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #dot0>>, !tt.ptr<tensor<32x256xf16, #dot1>>) : i32 {
-      // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
-      // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
-      %28 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xf16, #dot0>>
-      %29 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xf16, #dot1>>
+      // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
+      // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+      %28 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<256x32xf16, #dot0>>
+      %29 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<32x256xf16, #dot1>>
       // CHECK: tt.dot {{.*}}, {{.*}}, {{.*}}, inputPrecision = tf32 : tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>> * tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>> -> tensor<256x256xf32, #[[DPAS]]>
       // CHECK: tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
       // CHECK: tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
@@ -150,6 +150,73 @@ module attributes {"triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-wa

 // -----

+// COM: Case 2:
+// COM: Check that operations using block pointers without a divisibility attribute are rewritten to use a legacy pointer (a tensor of pointers).
+// CHECK: #[[DPAS:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 16], order = [1, 0]}>
+#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
+#dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>
+#dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>
+module attributes {"triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
+  tt.func public @matmul_kernel_with_block_pointers_indivisible(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32, %arg7: i32, %arg8: i32 {tt.divisibility = 16 : i32}) {
+    // CHECK: @matmul_kernel_with_block_pointers_indivisible
+    %c4_i32 = arith.constant 4 : i32
+    %c256_i32 = arith.constant 256 : i32
+    %c1_i64 = arith.constant 1 : i64
+    %c0_i32 = arith.constant 0 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c255_i32 = arith.constant 255 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #dpas>
+    %0 = tt.get_program_id x : i32
+    %1 = arith.addi %arg3, %c255_i32 : i32
+    %2 = arith.divsi %1, %c256_i32 : i32
+    %3 = arith.addi %arg4, %c255_i32 : i32
+    %4 = arith.divsi %3, %c256_i32 : i32
+    %5 = arith.muli %4, %c4_i32 : i32
+    %6 = arith.divsi %0, %5 : i32
+    %7 = arith.muli %6, %c4_i32 : i32
+    %8 = arith.subi %2, %7 : i32
+    %9 = arith.minsi %8, %c4_i32 : i32
+    %10 = arith.remsi %0, %9 : i32
+    %11 = arith.addi %7, %10 : i32
+    %12 = arith.remsi %0, %5 : i32
+    %13 = arith.divsi %12, %9 : i32
+    %14 = arith.muli %11, %c256_i32 : i32
+    %15 = arith.extsi %arg3 : i32 to i64
+    %16 = arith.extsi %arg5 : i32 to i64
+    %17 = arith.extsi %arg6 : i32 to i64
+    // CHECK-NOT: tt.make_tensor_ptr
+    %18 = tt.make_tensor_ptr %arg0, [%15, %16], [%17, %c1_i64], [%14, %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #dot0>>
+    %19 = arith.muli %13, %c256_i32 : i32
+    %20 = arith.extsi %arg4 : i32 to i64
+    %21 = arith.extsi %arg7 : i32 to i64
+    // CHECK-NOT: tt.make_tensor_ptr
+    %22 = tt.make_tensor_ptr %arg1, [%16, %20], [%21, %c1_i64], [%c0_i32, %19] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #dot1>>
+    %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #dot0>>, !tt.ptr<tensor<32x256xf16, #dot1>>) : i32 {
+      // CHECK: tt.load {{.*}}, {{.*}} : tensor<256x32x!tt.ptr<f16>, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>
+      // CHECK: tt.load {{.*}}, {{.*}} : tensor<32x256x!tt.ptr<f16>, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>
+      %28 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xf16, #dot0>>
+      %29 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xf16, #dot1>>
+      %30 = tt.dot %28, %29, %arg10, inputPrecision = tf32 : tensor<256x32xf16, #dot0> * tensor<32x256xf16, #dot1> -> tensor<256x256xf32, #dpas>
+      // CHECK-NOT: tt.advance
+      %31 = tt.advance %arg11, [%c0_i32, %c32_i32] : <tensor<256x32xf16, #dot0>>
+      // CHECK-NOT: tt.advance
+      %32 = tt.advance %arg12, [%c32_i32, %c0_i32] : <tensor<32x256xf16, #dot1>>
+      scf.yield %30, %31, %32 : tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #dot0>>, !tt.ptr<tensor<32x256xf16, #dot1>>
+    }
+    %24 = arith.truncf %23#0 : tensor<256x256xf32, #dpas> to tensor<256x256xf16, #dpas>
+    %25 = triton_gpu.convert_layout %24 : tensor<256x256xf16, #dpas> -> tensor<256x256xf16, #blocked>
+    %26 = arith.extsi %arg8 : i32 to i64
+    // CHECK-NOT: tt.make_tensor_ptr
+    %27 = tt.make_tensor_ptr %arg2, [%15, %20], [%26, %c1_i64], [%14, %19] {order = array<i32: 1, 0>} : <tensor<256x256xf16, #blocked>>
+    // CHECK: tt.store {{.*}}, {{.*}}, {{.*}} : tensor<256x256x!tt.ptr<f16>, #[[BLOCKED]]>
+    tt.store %27, %25 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x256xf16, #blocked>>
+    tt.return
+  }
+}
+
+// -----
+
 // COM: Case 3:
 // COM: Check that operations using block pointers without a layout attribute are rewritten to use a legacy pointer.
 module attributes {"triton_intel_gpu.support_sg_2d_block"} {