@@ -335,3 +335,56 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32
     tt.return
   }
 }
+
+// -----
+
+// COM: Case 5:
+// COM: Check that a tt.make_tensor_ptr op whose result is never loaded from is handled properly.
+// CHECK: #[[DPAS:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
+#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, triton_gpu.target = "xpu", "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_bf16_conversion, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} {
+  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    // CHECK: @matmul_kernel_with_block_pointers
+    %c4_i32 = arith.constant 4 : i32
+    %c256_i32 = arith.constant 256 : i32
+    %c1024_i64 = arith.constant 1024 : i64
+    %c5120_i64 = arith.constant 5120 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %c0_i32 = arith.constant 0 : i32
+    %c4096_i64 = arith.constant 4096 : i64
+    %c32_i32 = arith.constant 32 : i32
+    %c64_i32 = arith.constant 64 : i32
+    %c5120_i32 = arith.constant 5120 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #dpas>
+    %0 = tt.get_program_id x : i32
+    %1 = arith.divsi %0, %c64_i32 : i32
+    %2 = arith.muli %1, %c4_i32 : i32
+    %3 = arith.subi %c4_i32, %2 : i32
+    %4 = arith.minsi %3, %c4_i32 : i32
+    %5 = arith.remsi %0, %4 : i32
+    %6 = arith.addi %2, %5 : i32
+    %7 = arith.remsi %0, %c64_i32 : i32
+    %8 = arith.divsi %7, %4 : i32
+    %9 = arith.muli %6, %c256_i32 : i32
+    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
+    %10 = tt.make_tensor_ptr %arg0, [%c1024_i64, %c5120_i64], [%c5120_i64, %c1_i64], [%9, %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>
+    %11 = arith.muli %8, %c256_i32 : i32
+    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 0, 1>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+    %12 = tt.make_tensor_ptr %arg1, [%c5120_i64, %c4096_i64], [%c1_i64, %c5120_i64], [%c0_i32, %11] {order = array<i32: 0, 1>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
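+    // COM: Neither of the tensor pointers above is ever loaded from; the loop
+    // COM: below only advances them and carries the accumulator through unchanged.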
+    %13:3 = scf.for %arg3 = %c0_i32 to %c5120_i32 step %c32_i32 iter_args(%arg4 = %cst, %arg5 = %10, %arg6 = %12) -> (tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>, !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>) : i32 {
+      // CHECK: tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
+      // CHECK: tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+      %19 = tt.advance %arg5, [%c0_i32, %c32_i32] : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>
+      %20 = tt.advance %arg6, [%c32_i32, %c0_i32] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
+      scf.yield %arg4, %19, %20 : tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>, !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
+    }
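+    // COM: %13#0 is still the zero-filled accumulator, since the loop performed no loads or dot ops.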
+    %14 = tt.make_tensor_ptr %arg2, [%c1024_i64, %c4096_i64], [%c4096_i64, %c1_i64], [%9, %11] {order = array<i32: 1, 0>} : <tensor<256x256xf16, #dpas>>
+    %15 = arith.truncf %13#0 : tensor<256x256xf32, #dpas> to tensor<256x256xf16, #dpas>
+    // CHECK: tt.store {{.*}}, {{.*}}, {{.*}} : !tt.ptr<tensor<256x256xf16, #[[DPAS]]>
+    tt.store %14, %15 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x256xf16, #dpas>>
+    tt.return
+  }
+}