[triton-raise-block-ptr]: Fix failing test (#3219)

etiotto · web-flow · commit 686a8c14b626 · 2025-01-21T17:40:41.000-05:00
Fixes #3218

Signed-off-by: Tiotto, Ettore <ettore.tiotto@intel.com>
diff --git a/test/Triton/Intel/RaiseToBlockPointers/addptr_cmpge.mlir b/test/Triton/Intel/RaiseToBlockPointers/addptr_cmpge.mlir
@@ -1,5 +1,4 @@
 // RUN: triton-opt %s -triton-raise-block-pointer --split-input-file -canonicalize | FileCheck %s
-// XFAIL: *
 
 // These tests check that loads/stores that exhibit a cmp ge against 0 work
 // correctly with the pointer analysis pass
@@ -45,7 +44,10 @@ tt.func public @test_masked_load(%arg0: !tt.ptr<f16>) -> tensor<16x16xf16> {
 }
 
 // CHECK:         tt.func public @test_masked_load([[arg0:%.+]]: !tt.ptr<f16>) -> tensor<16x16xf16> {
-// CHECK:           [[VAR_0:%.+]] = tt.make_tensor_ptr [[arg0]], {{.*}} {order = array<i32>} : <tensor<16x16xf16>>
+// CHECK-DAG:       [[CST_0_i64:%.+]] = arith.constant 0 : i64
+// CHECK-DAG:       [[CST_1_i64:%.+]] = arith.constant 1 : i64
+// CHECK-DAG:       [[CST_0_i32:%.+]] = arith.constant 0 : i32
+// CHECK:           [[VAR_0:%.+]] = tt.make_tensor_ptr [[arg0]], {{\[}}[[CST_0_i64]], [[CST_0_i64]]], {{\[}}[[CST_1_i64]], [[CST_0_i64]]], {{\[}}[[CST_0_i32]], [[CST_0_i32]]] {{.*}} : <tensor<16x16xf16>>
 // CHECK:           [[VAR_1:%.+]] = tt.load [[VAR_0]] evictionPolicy = evict_last : !tt.ptr<tensor<16x16xf16>>
 // CHECK:           tt.return [[VAR_1]] : tensor<16x16xf16>
 // CHECK:         }
@@ -71,6 +73,9 @@ tt.func public @test_masked_store(%arg0: !tt.ptr<f16>) {
 
 // CHECK:         tt.func public @test_masked_store([[arg0:%.+]]: !tt.ptr<f16>) {
 // CHECK-DAG:       [[VAR_cst:%.+]] = arith.constant dense<1.500000e+01> : tensor<16x16xf16>
-// CHECK-DAG:       [[VAR_0:%.+]] = tt.make_tensor_ptr [[arg0]], {{.*}} {order = array<i32>} : <tensor<16x16xf16>>
+// CHECK-DAG:       [[CST_0_i64:%.+]] = arith.constant 0 : i64
+// CHECK-DAG:       [[CST_1_i64:%.+]] = arith.constant 1 : i64
+// CHECK-DAG:       [[CST_0_i32:%.+]] = arith.constant 0 : i32
+// CHECK:           [[VAR_0:%.+]] = tt.make_tensor_ptr [[arg0]], {{\[}}[[CST_0_i64]], [[CST_0_i64]]], {{\[}}[[CST_1_i64]], [[CST_0_i64]]], {{\[}}[[CST_0_i32]], [[CST_0_i32]]] {{.*}} : <tensor<16x16xf16>>
 // CHECK:           tt.store [[VAR_0]], [[VAR_cst]] : !tt.ptr<tensor<16x16xf16>>
 // CHECK:         }
diff --git a/test/Triton/Intel/RaiseToBlockPointers/addptr_for_expand_ptr.mlir b/test/Triton/Intel/RaiseToBlockPointers/addptr_for_expand_ptr.mlir
@@ -1,5 +1,6 @@
-// RUN: triton-shared-opt --triton-to-structured --remove-dead-values --canonicalize %s | FileCheck %s
+// RUN: triton-opt %s -triton-raise-block-pointer -canonicalize | FileCheck %s
 // XFAIL: *
+// TODO: add support for tt.expand_dims in loops
 
 module {
   tt.func @kernel(
diff --git a/test/Triton/Intel/RaiseToBlockPointers/kernel-02-fused-softmax.mlir b/test/Triton/Intel/RaiseToBlockPointers/kernel-02-fused-softmax.mlir
@@ -1,5 +1,4 @@
 // RUN: triton-opt %s -triton-raise-block-pointer -canonicalize | FileCheck %s
-// XFAIL: *
 
 module {
   tt.func public @softmax_kernel_012345(%arg0: !tt.ptr<f32>, %arg1: !tt.ptr<f32>, %arg2: i32, %arg3: i32, %arg4: i32) {
@@ -15,7 +14,7 @@ module {
     %8 = tt.splat %cst : f32 -> tensor<128xf32>
     // TODO: add back once masked loads are supported
     // %9 = tt.load %5, %7, %8 : tensor<128x!tt.ptr<f32>>
-    %9 = tt.load %5, %7 : tensor<128x!tt.ptr<f32>>
+    %9 = tt.load %5 : tensor<128x!tt.ptr<f32>>
     %10 = "tt.reduce"(%9) ({
     ^bb0(%arg5: f32, %arg6: f32):
       %21 = arith.cmpf ogt, %arg5, %arg6 : f32
@@ -48,35 +47,26 @@ module {
 // CHECK-DAG:       [[CST_1_i64:%.+]] = arith.constant 1 : i64
 // CHECK-DAG:       [[VAR_0_:%.+]] = tt.get_program_id x : i32
 // CHECK:           [[VAR_1_:%.+]] = arith.muli [[VAR_0_]], [[PARAM_2_]] : i32
-
-// CHECK:           [[VAR_2_:%.+]] = arith.index_cast [[VAR_1_]] : i32 to index
-// CHECK-DAG:       [[VAR_3_:%.+]] = tts.make_tptr [[PARAM_1_]] to sizes: [128], strides: [1], offsets: {{.}}[[VAR_2_]]{{.}}, shape: [0], order: [] : <f32> to tensor<128x!tt.ptr<f32>>
-// CHECK-DAG:       [[VAR_4_:%.+]] = arith.index_cast [[PARAM_4_]] : i32 to index
-// CHECK:           [[VAR_5_:%.+]] = arith.minsi [[VAR_4_]], [[CST_128_]] : index
-// CHECK:           [[VAR_6_:%.+]] = arith.maxsi [[VAR_5_]], [[CST_0_1_]] : index
-// CHECK:           [[VAR_7_:%.+]] = "tts.load"([[VAR_3_]], [[VAR_6_]], [[CST_0_]]) <{operandSegmentSizes = array<i32: 1, 1, 1>, static_mask_dims = array<i64: -9223372036854775808>}> : (tensor<128x!tt.ptr<f32>>, index, f32) -> tensor<128xf32>
-// CHECK:           [[VAR_8_:%.+]] = "tt.reduce"([[VAR_7_]]) <{axis = 0 : i32}> ({
+// CHECK:           [[VAR_2_:%.+]] = tt.make_tensor_ptr [[PARAM_1_]], {{\[}}[[CST_0_i64]]], {{\[}}[[CST_1_i64]]], {{\[}}[[VAR_1_]]] {{.*}} : <tensor<128xf32>>
+// CHECK:           [[VAR_3_:%.+]] = tt.load [[VAR_2_]] : !tt.ptr<tensor<128xf32>>
+// CHECK:           [[VAR_4_:%.+]] = "tt.reduce"([[VAR_3_]]) <{axis = 0 : i32}> ({
 // CHECK:           ^bb0([[IN_0_:%.+]]: f32, [[IN_1_:%.+]]: f32):
-// CHECK:             [[VAR_21_:%.+]] = arith.cmpf ogt, [[IN_0_]], [[IN_1_]] : f32
-// CHECK:             [[VAR_22_:%.+]] = arith.select [[VAR_21_]], [[IN_0_]], [[IN_1_]] : f32
-// CHECK:             tt.reduce.return [[VAR_22_]] : f32
+// CHECK:             [[VAR_13_:%.+]] = arith.cmpf ogt, [[IN_0_]], [[IN_1_]] : f32
+// CHECK:             [[VAR_14_:%.+]] = arith.select [[VAR_13_]], [[IN_0_]], [[IN_1_]] : f32
+// CHECK:             tt.reduce.return [[VAR_14_]] : f32
 // CHECK:           }) : (tensor<128xf32>) -> f32
-// CHECK:           [[VAR_9_:%.+]] = tt.splat [[VAR_8_]] : f32 -> tensor<128xf32>
-// CHECK:           [[VAR_10_:%.+]] = arith.subf [[VAR_7_]], [[VAR_9_]] : tensor<128xf32>
-// CHECK:           [[VAR_11_:%.+]] = math.exp [[VAR_10_]] : tensor<128xf32>
-// CHECK:           [[VAR_12_:%.+]] = "tt.reduce"([[VAR_11_]]) <{axis = 0 : i32}> ({
+// CHECK:           [[VAR_5_:%.+]] = tt.splat [[VAR_4_]] : f32 -> tensor<128xf32>
+// CHECK:           [[VAR_6_:%.+]] = arith.subf [[VAR_3_]], [[VAR_5_]] : tensor<128xf32>
+// CHECK:           [[VAR_7_:%.+]] = math.exp [[VAR_6_]] : tensor<128xf32>
+// CHECK:           [[VAR_8_:%.+]] = "tt.reduce"([[VAR_7_]]) <{axis = 0 : i32}> ({
 // CHECK:           ^bb0([[IN_2_:%.+]]: f32, [[IN_3_:%.+]]: f32):
-// CHECK:             [[VAR_21_1_:%.+]] = arith.addf [[IN_2_]], [[IN_3_]] : f32
-// CHECK:             tt.reduce.return [[VAR_21_1_]] : f32
+// CHECK:             [[VAR_13_1_:%.+]] = arith.addf [[IN_2_]], [[IN_3_]] : f32
+// CHECK:             tt.reduce.return [[VAR_13_1_]] : f32
 // CHECK:           }) : (tensor<128xf32>) -> f32
-// CHECK:           [[VAR_13_:%.+]] = tt.splat [[VAR_12_]] : f32 -> tensor<128xf32>
-// CHECK-DAG:       [[VAR_14_:%.+]] = arith.divf [[VAR_11_]], [[VAR_13_]] : tensor<128xf32>
-// CHECK-DAG:       [[VAR_15_:%.+]] = arith.muli [[VAR_0_]], [[PARAM_3_]] : i32
-// CHECK:           [[VAR_16_:%.+]] = arith.index_cast [[VAR_15_]] : i32 to index
-// CHECK-DAG:       [[VAR_17_:%.+]] = tts.make_tptr [[PARAM_0_]] to sizes: [128], strides: [1], offsets: {{.}}[[VAR_16_]]{{.}}, shape: [0], order: [] : <f32> to tensor<128x!tt.ptr<f32>>
-// CHECK-DAG:       [[VAR_18_:%.+]] = arith.index_cast [[PARAM_4_]] : i32 to index
-// CHECK:           [[VAR_19_:%.+]] = arith.minsi [[VAR_18_]], [[CST_128_]] : index
-// CHECK:           [[VAR_20_:%.+]] = arith.maxsi [[VAR_19_]], [[CST_0_1_]] : index
-// CHECK:           "tts.store"([[VAR_17_]], [[VAR_14_]], [[VAR_20_]]) <{static_mask_dims = array<i64: -9223372036854775808>}> : (tensor<128x!tt.ptr<f32>>, tensor<128xf32>, index) -> ()
+// CHECK:           [[VAR_9_:%.+]] = tt.splat [[VAR_8_]] : f32 -> tensor<128xf32>
+// CHECK-DAG:       [[VAR_10_:%.+]] = arith.divf [[VAR_7_]], [[VAR_9_]] : tensor<128xf32>
+// CHECK-DAG:       [[VAR_11_:%.+]] = arith.muli [[VAR_0_]], [[PARAM_3_]] : i32
+// CHECK:           [[VAR_12_:%.+]] = tt.make_tensor_ptr [[PARAM_0_]], {{\[}}[[CST_0_i64]]], {{\[}}[[CST_1_i64]]], {{\[}}[[VAR_11_]]] {{.*}} : <tensor<128xf32>>
+// CHECK:           tt.store [[VAR_12_]], [[VAR_10_]] : !tt.ptr<tensor<128xf32>>
 // CHECK:           tt.return
 // CHECK:         }
diff --git a/test/Triton/Intel/RaiseToBlockPointers/sign_extend_i32_to_i64.mlir b/test/Triton/Intel/RaiseToBlockPointers/sign_extend_i32_to_i64.mlir
@@ -1,5 +1,4 @@
 // RUN: triton-opt %s -triton-raise-block-pointer -canonicalize | FileCheck %s
-// XFAIL: *
 
 // IR from python/examples/sign_extend.py
 module {
@@ -16,7 +15,9 @@ module {
     %8 = arith.cmpi slt, %5, %7 : tensor<4xi64>
     %9 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<4x!tt.ptr<f32>>
     %10 = tt.addptr %9, %5 : tensor<4x!tt.ptr<f32>>, tensor<4xi64>
-    %11 = tt.load %10, %8, %cst : tensor<4x!tt.ptr<f32>>
+    %11 = tt.load %10 : tensor<4x!tt.ptr<f32>>
+    // TODO: uncomment once masked loads are supported
+    // %11 = tt.load %10, %8, %cst : tensor<4x!tt.ptr<f32>>
     %12 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<4x!tt.ptr<f32>>
     %13 = tt.addptr %12, %2 : tensor<4x!tt.ptr<f32>>, tensor<4xi32>
     tt.store %13, %11 : tensor<4x!tt.ptr<f32>>
@@ -25,18 +26,13 @@ module {
 }
 
 // CHECK:         tt.func public @sign_extend([[PARAM_0_:%.+]]: !tt.ptr<i32>, [[PARAM_1_:%.+]]: !tt.ptr<f32>, [[PARAM_2_:%.+]]: !tt.ptr<f32>, [[PARAM_3_:%.+]]: i32) attributes {noinline = false} {
-// CHECK-DAG:       [[CST_1_dot_100000_:%.+]] = arith.constant 1.100000e+01 : f32
-// CHECK-DAG:       [[CST_4_:%.+]] = arith.constant 4 : index
-// CHECK-DAG:       [[LOAD_PARAM_0_MEM_:%.+]] = tt.load [[PARAM_0_]] : !tt.ptr<i32>
-// CHECK:           [[VAR_1_:%.+]] = arith.index_cast [[LOAD_PARAM_0_MEM_]] : i32 to index
-// CHECK-DAG:       [[VAR_2_:%.+]] = tts.make_tptr [[PARAM_1_]] to sizes: [4], strides: [1], offsets: {{.}}[[VAR_1_]]{{.}}, shape: [0], order: [] : <f32> to tensor<4x!tt.ptr<f32>>
-// CHECK-DAG:       [[VAR_3_:%.+]] = arith.addi [[VAR_1_]], [[CST_4_]] : index
-// CHECK-DAG:       [[VAR_4_:%.+]] = arith.index_cast [[PARAM_3_]] : i32 to index
-// CHECK:           [[VAR_5_:%.+]] = arith.minsi [[VAR_3_]], [[VAR_4_]] : index
-// CHECK:           [[VAR_6_:%.+]] = arith.maxsi [[VAR_5_]], [[VAR_1_]] : index
-// CHECK:           [[VAR_7_:%.+]] = arith.subi [[VAR_6_]], [[VAR_1_]] : index
-// CHECK-DAG:       [[VAR_8_:%.+]] = "tts.load"([[VAR_2_]], [[VAR_7_]], [[CST_1_dot_100000_]]) <{operandSegmentSizes = array<i32: 1, 1, 1>, static_mask_dims = array<i64: -9223372036854775808>}> : (tensor<4x!tt.ptr<f32>>, index, f32) -> tensor<4xf32>
-// CHECK-DAG:       [[VAR_9_:%.+]] = tts.make_tptr [[PARAM_2_]] to sizes: [4], strides: [1], offsets: [0], shape: [0], order: [] : <f32> to tensor<4x!tt.ptr<f32>>
-// CHECK:           "tts.store"([[VAR_9_]], [[VAR_8_]]) <{static_mask_dims = array<i64>}> : (tensor<4x!tt.ptr<f32>>, tensor<4xf32>) -> ()
+// CHECK-DAG:       [[CST_0_i64:%.+]] = arith.constant 0 : i64
+// CHECK-DAG:       [[CST_1_i64:%.+]] = arith.constant 1 : i64
+// CHECK-DAG:       [[CST_0_i32:%.+]] = arith.constant 0 : i32
+// CHECK-DAG:       [[VAR_0_:%.+]] = tt.load [[PARAM_0_]] : !tt.ptr<i32>
+// CHECK-DAG:       [[VAR_1_:%.+]] = tt.make_tensor_ptr [[PARAM_1_]], {{\[}}[[CST_0_i64]]], {{\[}}[[CST_1_i64]]], {{\[}}[[VAR_0_]]] {{.*}} : <tensor<4xf32>>
+// CHECK-DAG:       [[VAR_2_:%.+]] = tt.load [[VAR_1_]] : !tt.ptr<tensor<4xf32>>
+// CHECK:           [[VAR_3_:%.+]] = tt.make_tensor_ptr [[PARAM_2_]], {{\[}}[[CST_0_i64]]], {{\[}}[[CST_1_i64]]], {{\[}}[[CST_0_i32]]] {{.*}} : <tensor<4xf32>>
+// CHECK:           tt.store [[VAR_3_]], [[VAR_2_]] : !tt.ptr<tensor<4xf32>>
 // CHECK:           tt.return
 // CHECK:         }
diff --git a/test/Triton/Intel/RaiseToBlockPointers/wraparound_unsupported_add_offset.mlir b/test/Triton/Intel/RaiseToBlockPointers/wraparound_unsupported_add_offset.mlir
@@ -1,5 +1,5 @@
 // RUN: triton-opt %s -triton-raise-block-pointer -canonicalize | FileCheck %s
-// XFAIL: *
+
 
 // We currently do not support this kind of modulo pattern:
 // (a + arange(0, K)) % M
@@ -59,15 +59,15 @@ module {
 }
 
 // CHECK:         tt.func public @wrap_side_by_side_masked_loop_01234567([[arg0_:.+]]: !tt.ptr<f32>, [[arg1_:.+]]: !tt.ptr<f32>, [[arg2_:.+]]: i32, [[arg3_:.+]]: i32, [[arg4_:.+]]: i32, [[arg5_:.+]]: i32, [[arg6_:.+]]: i32, [[arg7_:.+]]: i32) {
-// CHECK-DAG:       [[CST_0_:%.+]] = arith.constant 0 : index
+// CHECK-DAG:       [[CST_0_i64:%.+]] = arith.constant 0 : i64
 // CHECK-DAG:       [[VAR_cst_:%.+]] = arith.constant dense<-9.900000e+01> : tensor<4x4xf32>
-// CHECK-DAG:       [[CST_1_:%.+]] = arith.constant 1 : i32
-// CHECK-DAG:       [[CST_0_1_:%.+]] = arith.constant 0 : i32
-// CHECK-DAG:       [[CST_2_:%.+]] = arith.constant 2 : i32
+// CHECK-DAG:       [[CST_1_i32:%.+]] = arith.constant 1 : i32
+// CHECK-DAG:       [[CST_0_i32:%.+]] = arith.constant 0 : i32
+// CHECK-DAG:       [[CST_2_i32:%.+]] = arith.constant 2 : i32
 // CHECK-DAG:       [[VAR_cst_0_:%.+]] = arith.constant dense<2> : tensor<4x1xi32>
 // CHECK-DAG:       [[VAR_cst_1_:%.+]] = arith.constant dense<6> : tensor<4xi32>
 // CHECK-DAG:       [[VAR_cst_2_:%.+]] = arith.constant dense<2> : tensor<4xi32>
-// CHECK-DAG:       [[CST_4_:%.+]] = arith.constant 4 : i32
+// CHECK-DAG:       [[CST_4_i32:%.+]] = arith.constant 4 : i32
 // CHECK-DAG:       [[VAR_0_:%.+]] = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
 // CHECK-NOT: separator of consecutive DAGs
 // CHECK-DAG:       [[VAR_1_:%.+]] = arith.addi [[VAR_0_]], [[VAR_cst_2_]] : tensor<4xi32>
@@ -90,22 +90,27 @@ module {
 // CHECK-DAG:       [[VAR_15_:%.+]] = tt.addptr [[VAR_14_]], [[VAR_13_]] : tensor<4x4x!tt.ptr<f32>>, tensor<4x4xi32>
 // CHECK-DAG:       [[VAR_16_:%.+]] = tt.expand_dims [[VAR_0_]] {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32>
 // CHECK-DAG:       [[VAR_17_:%.+]] = arith.index_cast [[arg6_]] : i32 to index
-// CHECK-DAG:       [[VAR_18_:%.+]] = arith.index_cast [[arg7_]] : i32 to index
-// CHECK:           [[VAR_19_:%.+]] = arith.cmpi slt, [[VAR_16_]], [[VAR_cst_0_]] : tensor<4x1xi32>
-// CHECK-DAG:       [[VAR_20_:%.+]] = tt.broadcast [[VAR_19_]] : tensor<4x1xi1> -> tensor<4x4xi1>
-// CHECK-DAG:       [[VAR_21_:%.+]] = arith.muli [[arg4_]], [[CST_4_]] : i32
+// CHECK-DAG:       [[VAR_18_:%.+]] = arith.index_cast [[VAR_17_]] : index to i64
+// CHECK-DAG:       [[VAR_19_:%.+]] = arith.index_cast [[arg7_]] : i32 to index
+// CHECK-DAG:       [[VAR_20_:%.+]] = arith.index_cast [[VAR_19_]] : index to i64
+// CHECK-DAG:       [[VAR_21_:%.+]] = arith.trunci [[VAR_18_]] : i64 to i32
+// CHECK-DAG:       [[VAR_22_:%.+]] = arith.divui [[CST_0_i32]], [[VAR_21_]] : i32
+// CHECK-DAG:       [[VAR_23_:%.+]] = arith.trunci [[VAR_20_]] : i64 to i32
+// CHECK-DAG:       [[VAR_24_:%.+]] = arith.divui [[CST_0_i32]], [[VAR_23_]] : i32
+// CHECK:           [[VAR_25_:%.+]] = tt.make_tensor_ptr [[arg1_]], {{\[}}[[CST_0_i64]], [[CST_0_i64]]], {{\[}}[[VAR_18_]], [[VAR_20_]]], {{\[}}[[VAR_22_]], [[VAR_24_]]] {{.*}} : <tensor<4x4xf32>>
+// CHECK:           [[VAR_26_:%.+]] = arith.cmpi slt, [[VAR_16_]], [[VAR_cst_0_]] : tensor<4x1xi32>
+// CHECK-DAG:       [[VAR_27_:%.+]] = tt.broadcast [[VAR_26_]] : tensor<4x1xi1> -> tensor<4x4xi1>
+// CHECK-DAG:       [[VAR_28_:%.+]] = arith.muli [[arg4_]], [[CST_4_i32]] : i32
 // CHECK-NOT: separator of consecutive DAGs
-// CHECK-DAG:       [[VAR_22_:%.+]] = tt.splat [[VAR_21_]] : i32 -> tensor<4x4xi32>
-// CHECK-DAG:       [[VAR_23_:%.+]] = arith.muli [[arg5_]], [[CST_4_]] : i32
+// CHECK-DAG:       [[VAR_29_:%.+]] = tt.splat [[VAR_28_]] : i32 -> tensor<4x4xi32>
+// CHECK-DAG:       [[VAR_30_:%.+]] = arith.muli [[arg5_]], [[CST_4_i32]] : i32
 // CHECK-NOT: separator of consecutive DAGs
-// CHECK-DAG:       [[VAR_24_:%.+]] = arith.index_cast [[VAR_23_]] : i32 to index
-// CHECK-DAG:       [[VAR_25_:%.+]]:2 = scf.for [[VAR_arg8_:%.+]] = [[CST_0_1_]] to [[CST_2_]] step [[CST_1_]] iter_args([[VAR_arg9_:%.+]] = [[VAR_15_]], [[VAR_arg10_:%.+]] = [[CST_0_]]) -> (tensor<4x4x!tt.ptr<f32>>, index)  : i32 {
-// CHECK-DAG:         [[VAR_26_:%.+]] = tts.make_tptr [[arg1_]] to sizes: [4, 4], strides: {{.}}[[VAR_17_]], [[VAR_18_]]{{.}}, offsets: {{.}}[[VAR_arg10_]], [[CST_0_]]{{.}}, shape: [0, 0], order: [] : <f32> to tensor<4x4x!tt.ptr<f32>>
-// CHECK-DAG:         [[LOAD_VAR_arg9_MEM_:%.+]] = tt.load [[VAR_arg9_]], [[VAR_20_]], [[VAR_cst_]] : tensor<4x4x!tt.ptr<f32>>
-// CHECK:             "tts.store"([[VAR_26_]], [[LOAD_VAR_arg9_MEM_]]) <{static_mask_dims = array<i64>}> : (tensor<4x4x!tt.ptr<f32>>, tensor<4x4xf32>) -> ()
-// CHECK-DAG:         [[VAR_28_:%.+]] = tt.addptr [[VAR_arg9_]], [[VAR_22_]] : tensor<4x4x!tt.ptr<f32>>, tensor<4x4xi32>
-// CHECK-DAG:         [[VAR_29_:%.+]] = arith.addi [[VAR_arg10_]], [[VAR_24_]] : index
-// CHECK:             scf.yield [[VAR_28_]], [[VAR_29_]] : tensor<4x4x!tt.ptr<f32>>, index
+// CHECK-DAG:       [[VAR_31_:%.+]]:2 = scf.for [[VAR_arg8_:%.+]] = [[CST_0_i32]] to [[CST_2_i32]] step [[CST_1_i32]] iter_args([[VAR_arg9_:%.+]] = [[VAR_15_]], [[VAR_arg10_:%.+]] = [[VAR_25_]]) -> (tensor<4x4x!tt.ptr<f32>>, !tt.ptr<tensor<4x4xf32>>)
+// CHECK:             [[VAR_32_:%.+]] = tt.load [[VAR_arg9_]], [[VAR_27_]], [[VAR_cst_]] : tensor<4x4x!tt.ptr<f32>>
+// CHECK:             tt.store [[VAR_arg10_]], [[VAR_32_]] : !tt.ptr<tensor<4x4xf32>>
+// CHECK-DAG:         [[VAR_33_:%.+]] = tt.addptr [[VAR_arg9_]], [[VAR_29_]] : tensor<4x4x!tt.ptr<f32>>, tensor<4x4xi32>
+// CHECK-DAG:         [[VAR_34_:%.+]] = tt.advance [[VAR_arg10_]], {{\[}}[[CST_0_i32]], [[VAR_30_]]] : <tensor<4x4xf32>>
+// CHECK:             scf.yield [[VAR_33_]], [[VAR_34_]] : tensor<4x4x!tt.ptr<f32>>, !tt.ptr<tensor<4x4xf32>>
 // CHECK:           }
 // CHECK:           tt.return
 // CHECK:         }
diff --git a/third_party/intel/lib/TritonRaiseBlockPointer/TritonRaiseBlockPointer.cpp b/third_party/intel/lib/TritonRaiseBlockPointer/TritonRaiseBlockPointer.cpp