diff --git a/test/Analysis/intel/test-axis-info.mlir b/test/Analysis/intel/test-axis-info.mlir index 1a3805f018..39dcd0bd3e 100644 --- a/test/Analysis/intel/test-axis-info.mlir +++ b/test/Analysis/intel/test-axis-info.mlir @@ -885,9 +885,11 @@ tt.func public @make_tensor_ptr(%arg0: !tt.ptr, %arg1: !tt.ptr {tt. %c1_i64 = arith.constant 1 : i64 %c32_i64 = arith.constant 32 : i64 %c128_i64 = arith.constant 128 : i64 - // CHECK: %0 = tt.make_tensor_ptr %arg0, {{.*}} => contiguity = [128, 32], divisibility = [1, 1], constancy = [1, 1], constant_value = + // CHECK: tt.make_tensor_ptr %arg0, {{.*}} => contiguity = [128, 32], divisibility = [1, 1], constancy = [1, 1], constant_value = %0 = tt.make_tensor_ptr %arg0, [%c128_i64, %c32_i64], [%c1_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : !tt.ptr> - // CHECK: %1 = tt.make_tensor_ptr %arg1, {{.*}} => contiguity = [32, 1], divisibility = [16, 1], constancy = [1, 1], constant_value = + // CHECK: tt.make_tensor_ptr %arg1, {{.*}} => contiguity = [64, 1], divisibility = [16, 1], constancy = [1, 1], constant_value = %1 = tt.make_tensor_ptr %arg1, [%c32_i64, %c32_i64], [%c1_i64, %arg2], [%c0_i32, %c0_i32] {order = array} : > + // CHECK: tt.make_tensor_ptr %arg1, {{.*}} => contiguity = [32, 64], divisibility = [1, 1], constancy = [1, 1], constant_value = + %2 = tt.make_tensor_ptr %arg1, [%arg2, %c128_i64], [%c1_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > tt.return } diff --git a/test/TritonIntelGPU/coalesce.mlir b/test/TritonIntelGPU/coalesce.mlir new file mode 100644 index 0000000000..d9b2de454c --- /dev/null +++ b/test/TritonIntelGPU/coalesce.mlir @@ -0,0 +1,338 @@ +// RUN: triton-opt %s -split-input-file -tritonintelgpu-coalesce | FileCheck %s + +#blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1]}> +#slice1dim1 = #triton_gpu.slice<{dim = 1, parent = #blocked1}> +#slice2dim0 = #triton_gpu.slice<{dim = 0, parent = #blocked2}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + +// CHECK: [[row_layout:#.*]] = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}> +// CHECK: [[col_layout:#.*]] = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}> +// CHECK: [[load_ptr:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64x!tt.ptr, [[row_layout]]> +// CHECK: [[load_mask:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xi1, [[row_layout]]> +// CHECK: [[load_other:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xf32, [[row_layout]]> +// CHECK: [[load_val:%.*]] = tt.load [[load_ptr]], [[load_mask]], [[load_other]] : tensor<64x64x!tt.ptr, [[row_layout]]> +// CHECK: [[store_ptr:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64x!tt.ptr, [[col_layout]]> +// CHECK: [[store_val:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xf32, [[col_layout]]> +// CHECK: [[store_mask:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xi1, [[col_layout]]> +// CHECK: tt.store [[store_ptr]], [[store_val]], [[store_mask]] +tt.func @transpose(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, + %arg1: i32 {tt.divisibility = 16 : i32}, + %arg2: !tt.ptr {tt.divisibility = 16 : i32}, + %arg3: 
i32 {tt.divisibility = 16 : i32}) { + %cst = arith.constant dense : tensor<64x64xi1, #blocked1> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked1> + %00 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #slice1dim1> + %01 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #slice2dim0> + %1 = tt.expand_dims %00 {axis = 1 : i32} : tensor<64xi32, #slice1dim1> -> tensor<64x1xi32, #blocked1> + %2 = tt.splat %arg1 : i32 -> tensor<64x1xi32, #blocked1> + %3 = arith.muli %1, %2 : tensor<64x1xi32, #blocked1> + %4 = tt.splat %arg0 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> + %5 = tt.addptr %4, %3 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> + %6 = tt.expand_dims %01 {axis = 0 : i32} : tensor<64xi32, #slice2dim0> -> tensor<1x64xi32, #blocked2> + %7 = tt.broadcast %5 : tensor<64x1x!tt.ptr, #blocked1> -> tensor<64x64x!tt.ptr, #blocked1> + %8 = tt.broadcast %6 : tensor<1x64xi32, #blocked2> -> tensor<64x64xi32, #blocked2> + %9 = triton_gpu.convert_layout %8 : tensor<64x64xi32, #blocked2> -> tensor<64x64xi32, #blocked1> + %10 = tt.addptr %7, %9 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> + %11 = tt.splat %arg2 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> + %12 = tt.addptr %11, %1 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> + %13 = tt.splat %arg3 : i32 -> tensor<1x64xi32, #blocked2> + %14 = arith.muli %6, %13 : tensor<1x64xi32, #blocked2> + %15 = tt.broadcast %12 : tensor<64x1x!tt.ptr, #blocked1> -> tensor<64x64x!tt.ptr, #blocked1> + %16 = tt.broadcast %14 : tensor<1x64xi32, #blocked2> -> tensor<64x64xi32, #blocked2> + %17 = triton_gpu.convert_layout %16 : tensor<64x64xi32, #blocked2> -> tensor<64x64xi32, #blocked1> + %18 = tt.addptr %15, %17 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> + %19 = tt.load %10, %cst, %cst_0 : tensor<64x64x!tt.ptr, #blocked1> + tt.store %18, %19, %cst : tensor<64x64x!tt.ptr, #blocked1> + tt.return +} + +} + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + + +// CHECK: [[NARROW_LAYOUT:#.*]] = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +// CHECK: [[WIDE_LAYOUT:#.*]] = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +tt.func public @load_tensors_two_types(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %1 : i32 -> tensor<1024xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked> + %5 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> + %6 = arith.cmpi "slt", %4, %5 : tensor<1024xi32, #blocked> + %7 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %8 = tt.addptr %7, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %9 = tt.load %8, %6 : tensor<1024x!tt.ptr, #blocked> + %10 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %12 = tt.load %11, %6 : 
tensor<1024x!tt.ptr, #blocked> + %13 = arith.extf %12 : tensor<1024xf16, #blocked> to tensor<1024xf32, #blocked> + %14 = arith.addf %9, %13 : tensor<1024xf32, #blocked> + %15 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %16 = tt.addptr %15, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + // CHECK: tt.store {{.*}} : tensor<1024x!tt.ptr, [[WIDE_LAYOUT]]> + tt.store %16, %14, %6 : tensor<1024x!tt.ptr, #blocked> + tt.return +} + +} + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-NOT: sizePerThread = [4] +// CHECK: #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +// CHECK-NOT: sizePerThread = [4] +tt.func public @load_tensors_two_types(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %1 : i32 -> tensor<1024xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked> + %5 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> + %6 = arith.cmpi "slt", %4, %5 : tensor<1024xi32, #blocked> + %7 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %8 = tt.addptr %7, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %9 = tt.load %8, %6 : tensor<1024x!tt.ptr, #blocked> + %10 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %12 = tt.load %11, %6 : tensor<1024x!tt.ptr, #blocked> + %13 = arith.extf %12 : tensor<1024xf16, #blocked> to tensor<1024xf32, #blocked> + %14 = arith.addf %9, %13 : tensor<1024xf32, #blocked> + %15 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %16 = tt.addptr %15, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %17 = arith.truncf %14 : tensor<1024xf32, #blocked> to tensor<1024xf16, #blocked> + tt.store %16, %17, %6 : tensor<1024x!tt.ptr, #blocked> + tt.return +} + +} + +// ----- + +// COM: Reproducer for issue #3866 +// CHECK-LABEL: @test_3866 +// CHECK: tt.load {{.*}} : !tt.ptr +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + tt.func public @test_3866(%arg0: !tt.ptr, %arg1: i32, %arg2: i64) { + %0 = tt.make_tensor_ptr %arg0, [%arg2, %arg2], [%arg2, %arg2], [%arg1, %arg1] {order = array} : > + %1 = tt.load %0 : !tt.ptr> + tt.return + } +} + +// ----- + +// COM: Test coalescing on blocked pointers: coalescable load using block pointer in a SCF for loop. 
+ +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [4], order = [0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 1], order = [1, 0]}> +#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> +#dot1 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}> +#dot2 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + // CHECK: [[BLOCKED_LAYOUT1:#.*]] = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 8], warpsPerCTA = [4, 1], order = [1, 0]}> + // CHECK: [[BLOCKED_LAYOUT2:#.*]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [4, 4], warpsPerCTA = [1, 4], order = [0, 1]}> + // CHECK: @test_block_ptrs + tt.func public @test_block_ptrs(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: f32, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}, %arg9: i32, %arg10: i32, %arg11: i32 {tt.divisibility = 16 : i32}, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32, %arg16: i32, %arg17: i32 {tt.divisibility = 16 : i32}, %arg18: i32, %arg19: i32, %arg20: i32) { + %cst = arith.constant dense<0.000000e+00> : tensor<8x16xf32, #dpas> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<8xf32, #blocked> + %cst_1 = arith.constant dense<0xFF800000> : tensor<8xf32, #blocked> + %c1_i32 = arith.constant 1 : i32 + %c16_i32 = arith.constant 16 : i32 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<8x64xf32, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c1_i64 = arith.constant 1 : i64 + %c64_i64 = arith.constant 64 : i64 + %c8_i32 = arith.constant 8 : i32 + %0 = tt.get_program_id x : i32 + %1 = tt.get_program_id y : i32 + %2 = arith.divsi %1, %arg19 : i32 + %3 = arith.remsi %1, %arg19 : i32 + %4 = arith.extsi %2 : i32 to i64 + %5 = arith.extsi %arg6 : i32 to i64 + %6 = arith.muli %4, %5 : i64 + %7 = arith.extsi %3 : i32 to i64 + %8 = arith.extsi %arg7 : i32 to i64 + %9 = arith.muli %7, %8 : i64 + %10 = arith.addi %6, %9 : i64 + %11 = tt.addptr %arg0, %10 : !tt.ptr, i64 + %12 = arith.muli %0, %c8_i32 : i32 + %13 = arith.extsi %arg20 : i32 to i64 + %14 = arith.extsi %arg8 : i32 to i64 + // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}} : + %15 = tt.make_tensor_ptr %11, [%13, %c64_i64], [%14, %c1_i64], [%12, %c0_i32] {order = array} : > + %16 = tt.addptr %arg1, %10 : !tt.ptr, i64 + %17 = arith.extsi %arg11 : i32 to i64 + // CHECK: [[PTR2:%.*]] = tt.make_tensor_ptr {{.*}} : + %18 = tt.make_tensor_ptr %16, [%c64_i64, %13], [%c1_i64, %17], [%c0_i32, %c0_i32] {order = array} : > + %19 = tt.addptr %arg5, %10 : !tt.ptr, i64 + %20 = arith.extsi %arg17 : i32 to i64 + // CHECK: [[PTR3:%.*]] = tt.make_tensor_ptr {{.*}} : + %21 = tt.make_tensor_ptr %19, [%13, %c64_i64], [%20, %c1_i64], [%12, %c0_i32] {order = array} : > + %22 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked> + %23 = tt.splat %12 : 
i32 -> tensor<8xi32, #blocked> + %24 = arith.addi %23, %22 : tensor<8xi32, #blocked> + // CHECK: [[LOAD1:%.*]] = tt.load [[PTR1]] : !tt.ptr + // CHECK-NEXT: triton_gpu.convert_layout [[LOAD1]] : tensor<8x64xf8E5M2, [[BLOCKED_LAYOUT1]]> -> tensor<8x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %25 = tt.load %15 : !tt.ptr> + %26 = arith.addi %0, %c1_i32 : i32 + %27 = arith.muli %26, %c8_i32 : i32 + // CHECK: [[ADVANCE1:%.*]] = tt.advance [[PTR2]], {{.*}} : > + %28 = tt.advance %18, [%c0_i32, %12] : > + // CHECK: [[RES:%.*:2]] = scf.for {{.*}} iter_args(%arg22 = %cst_1, %arg23 = [[ADVANCE1]]) -> (tensor<8xf32, #blocked>, !tt.ptr>) + %29:2 = scf.for %arg21 = %12 to %27 step %c16_i32 iter_args(%arg22 = %cst_1, %arg23 = %28) -> (tensor<8xf32, #blocked>, !tt.ptr>) : i32 { + // CHECK: [[LOAD2:%.*]] = tt.load %arg23 : !tt.ptr> + // CHECK-NEXT: triton_gpu.convert_layout [[LOAD2]] : tensor<64x16xf8E5M2, [[BLOCKED_LAYOUT2]]> -> tensor<64x16xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %36 = tt.load %arg23 : !tt.ptr> + %37 = tt.fp_to_fp %25 : tensor<8x64xf8E5M2, #dot1> -> tensor<8x64xf16, #dot1> + %38 = tt.fp_to_fp %36 : tensor<64x16xf8E5M2, #dot2> -> tensor<64x16xf16, #dot2> + %39 = tt.dot %37, %38, %cst, inputPrecision = tf32 : tensor<8x64xf16, #dot1> * tensor<64x16xf16, #dot2> -> tensor<8x16xf32, #dpas> + %40 = triton_gpu.convert_layout %39 : tensor<8x16xf32, #dpas> -> tensor<8x16xf32, #blocked2> + %41 = "tt.reduce"(%40) <{axis = 1 : i32}> ({ + ^bb0(%arg24: f32, %arg25: f32): + %44 = arith.maxnumf %arg24, %arg25 : f32 + tt.reduce.return %44 : f32 + }) : (tensor<8x16xf32, #blocked2>) -> tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> + %42 = triton_gpu.convert_layout %41 : tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<8xf32, #blocked> + // CHECK: [[ADVANCE2:%.*]] = tt.advance %arg23, {{.*}} : > + // CHECK-NEXT: scf.yield {{.*}}, [[ADVANCE2]] : tensor<8xf32, #blocked>, !tt.ptr> + %43 = tt.advance %arg23, [%c0_i32, %c16_i32] : > + scf.yield %42, %43 : tensor<8xf32, #blocked>, !tt.ptr> + } {tt.divisibility_arg1 = dense<16> : tensor<1xi32>} + %30 = arith.addf %29#0, %cst_0 : tensor<8xf32, #blocked> + %31 = arith.muli %1, %arg20 : i32 + %32 = tt.addptr %arg4, %31 : !tt.ptr, i32 + %33 = tt.splat %32 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %34 = tt.addptr %33, %24 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + tt.store %34, %30 : tensor<8x!tt.ptr, #blocked> + %35 = tt.fp_to_fp %cst_2, rounding = rtne : tensor<8x64xf32, #blocked1> -> tensor<8x64xf8E5M2, #blocked1> + tt.store %21, %35 : !tt.ptr> + tt.return + } +} + +// ----- + +// COM: Test coalescing on blocked pointers: loop results used by another loop. 
+ +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [4], order = [0]}> +#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}> +#dot2 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK: [[BLOCKED_LAYOUT:#.*]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}> + // CHECK: @test_block_ptrs + tt.func public @test_block_ptrs(%arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32}, %arg19: i32) { + %cst_1 = arith.constant dense<1.000000e+00> : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>> + %c32_i32 = arith.constant 32 : i32 + %c64_i32 = arith.constant 64 : i32 + %c64_i64 = arith.constant 64 : i64 + %c1_i64 = arith.constant 1 : i64 + %c0_i32 = arith.constant 0 : i32 + %0 = tt.get_program_id x : i32 + %1 = tt.get_program_id y : i32 + %2 = arith.divsi %1, %arg19 : i32 + %3 = arith.remsi %1, %arg19 : i32 + %4 = arith.extsi %2 : i32 to i64 + %5 = arith.extsi %arg6 : i32 to i64 + %6 = arith.muli %4, %5 : i64 + %7 = arith.extsi %3 : i32 to i64 + %8 = arith.extsi %arg7 : i32 to i64 + %9 = arith.muli %7, %8 : i64 + %10 = arith.addi %6, %9 : i64 + %12 = arith.muli %0, %c64_i32 : i32 + %13 = arith.extsi %arg19 : i32 to i64 + %19 = tt.addptr %arg1, %10 : !tt.ptr, i64 + %20 = arith.extsi %arg11 : i32 to i64 + // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}} : + %21 = tt.make_tensor_ptr %19, [%c64_i64, %13], [%c1_i64, %20], [%c0_i32, %c0_i32] {order = array} : > + // CHECK: [[RES:%.*]]:2 = scf.for {{.*}} iter_args([[ARG1:%.*]] = %cst, [[ARG2:%.*]] = [[PTR1]]) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr>) + %33:2 = scf.for %arg21 = %c0_i32 to %12 step %c32_i32 iter_args(%arg22 = %cst_1, %arg23 = %21) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>>, !tt.ptr>) : i32 { + // CHECK: [[LOAD:%.*]] = tt.load [[ARG2]] : !tt.ptr> + // CHECK-NEXT: triton_gpu.convert_layout [[LOAD]] : tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]> -> tensor<64x32xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + // CHECK-NEXT: scf.yield [[ARG1]], [[ARG2]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr> + %load = tt.load %arg23 : !tt.ptr> + scf.yield %arg22, %arg23 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>>, !tt.ptr> + } + // CHECK: scf.for {{.*}} iter_args([[ARG1:%.*]] = [[RES]]#0, [[ARG2:%.*]] = [[RES]]#1) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr>) + %34:2 = scf.for %arg21 = %c0_i32 to %12 step %c32_i32 iter_args(%arg22 = %33#0, %arg23 = %33#1) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>>, !tt.ptr>) : i32 { + // CHECK: scf.yield [[ARG1]], [[ARG2]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr> + scf.yield %arg22, %arg23 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>>, !tt.ptr> + } + tt.return + } +} + +// ----- + +// COM: Test coalescing on blocked pointers: loop with 2 output blocked pointers. 
+ +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [4], order = [0]}> +#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + // CHECK: [[BLOCKED_LAYOUT:#.*]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [4, 4], warpsPerCTA = [1, 4], order = [0, 1]}> + // CHECK: @test_block_ptrs + tt.func public @test_block_ptrs(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32, %arg11: i32 {tt.divisibility = 16 : i32}, %arg14: i32, %arg19: i32, %arg20: i32) { + %c32_i32 = arith.constant 32 : i32 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> + %c64_i32 = arith.constant 64 : i32 + %c64_i64 = arith.constant 64 : i64 + %c1_i64 = arith.constant 1 : i64 + %c0_i32 = arith.constant 0 : i32 + %0 = tt.get_program_id x : i32 + %1 = tt.get_program_id y : i32 + %2 = arith.divsi %1, %arg19 : i32 + %3 = arith.remsi %1, %arg19 : i32 + %4 = arith.extsi %2 : i32 to i64 + %5 = arith.extsi %arg6 : i32 to i64 + %6 = arith.muli %4, %5 : i64 + %7 = arith.extsi %3 : i32 to i64 + %8 = arith.extsi %arg7 : i32 to i64 + %9 = arith.muli %7, %8 : i64 + %10 = arith.addi %6, %9 : i64 + %11 = tt.addptr %arg0, %10 : !tt.ptr, i64 + %12 = arith.muli %0, %c64_i32 : i32 + %13 = arith.extsi %arg20 : i32 to i64 + %14 = arith.extsi %arg8 : i32 to i64 + %15 = tt.make_tensor_ptr %11, [%13, %c64_i64], [%14, %c1_i64], [%12, %c0_i32] {order = array} : >> + %16 = tt.addptr %arg2, %10 : !tt.ptr, i64 + %17 = arith.extsi %arg14 : i32 to i64 + // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}} : >> + %18 = tt.make_tensor_ptr %16, [%13, %c64_i64], [%c1_i64, %17], [%c0_i32, %c0_i32] {order = array} : >> + %19 = tt.addptr %arg1, %10 : !tt.ptr, i64 + %20 = arith.extsi %arg11 : i32 to i64 + // CHECK: [[PTR2:%.*]] = tt.make_tensor_ptr {{.*}} : + %21 = tt.make_tensor_ptr %19, [%c64_i64, %13], [%c1_i64, %20], [%c0_i32, %c0_i32] {order = array} : >> + %32 = tt.load %15 : !tt.ptr>> + // CHECK: scf.for {{.*}} iter_args([[ARG1:%.*]] = [[PTR2]], [[ARG2:%.*]] = [[PTR1]]) -> (!tt.ptr>, !tt.ptr>>) + %35:2 = scf.for %arg21 = %c0_i32 to %12 step %c32_i32 iter_args(%arg25 = %21, %arg26 = %18) -> (!tt.ptr>>, !tt.ptr>>) : i32 { + // CHECK: [[LOAD:%.*]] = tt.load [[ARG1]] : !tt.ptr> + // CHECK-NEXT: triton_gpu.convert_layout [[LOAD]] : tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]> -> tensor<64x32xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %58 = tt.load %arg25 : !tt.ptr>> + %59 = tt.fp_to_fp %32 : tensor<64x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %60 = tt.fp_to_fp %58 : tensor<64x32xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x32xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %61 = tt.dot %59, %60, %cst_2, inputPrecision = tf32 : tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x32xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x32xf32, #mma> + // CHECK-DAG: [[ADVANCE1:%.*]] = 
tt.advance [[ARG1]], {{.*}} : > + // CHECK-DAG: [[ADVANCE2:%.*]] = tt.advance [[ARG2]], {{.*}} : >> + // CHECK-NEXT: scf.yield [[ADVANCE1]], [[ADVANCE2]] : !tt.ptr>, !tt.ptr>> + %84 = tt.advance %arg26, [%c32_i32, %c0_i32] : >> + %85 = tt.advance %arg25, [%c0_i32, %c32_i32] : >> + scf.yield %85, %84 : !tt.ptr>>, !tt.ptr>> + } + tt.return + } +} diff --git a/third_party/intel/include/Analysis/AxisInfo.h b/third_party/intel/include/Analysis/AxisInfo.h index 3016e02cad..1fbaba2e0c 100644 --- a/third_party/intel/include/Analysis/AxisInfo.h +++ b/third_party/intel/include/Analysis/AxisInfo.h @@ -11,7 +11,6 @@ #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include -#include namespace mlir::triton::intel { diff --git a/third_party/intel/lib/Analysis/AxisInfo.cpp b/third_party/intel/lib/Analysis/AxisInfo.cpp index 879b53e49f..378ba01442 100644 --- a/third_party/intel/lib/Analysis/AxisInfo.cpp +++ b/third_party/intel/lib/Analysis/AxisInfo.cpp @@ -1008,49 +1008,49 @@ class MakeTensorPtrOpAxisInfoVisitor final ArrayRef *> operands) override { LDBG("MakeTensorPtrOpAxisInfoVisitor: " << *op); - // TODO: Extend to higher dimension tensor pointers. - if (op.getShape().size() != 2) + auto ptrTy = cast(op.getResult().getType()); + auto tensorType = cast(ptrTy.getPointeeType()); + ArrayRef blkShape = tensorType.getShape(); + unsigned rank = op.getShape().size(); + + // TODO: Support higher rank tensors. + if (rank > 2) return AxisInfo(); - assert(operands.size() == 7 && "MakeTensorPtrOp should have 2D shape"); + SmallVector strideInfo; + for (int i = rank + 1; i <= rank * 2; ++i) + strideInfo.emplace_back(operands[i]->getValue()); AxisInfo ptrInfo = operands[0]->getValue(); - AxisInfo shapeInfo0 = operands[1]->getValue(); - AxisInfo shapeInfo1 = operands[2]->getValue(); - AxisInfo strideInfo0 = operands[3]->getValue(); - AxisInfo strideInfo1 = operands[4]->getValue(); - - std::optional shape0 = shapeInfo0.getConstantValue(); - std::optional shape1 = shapeInfo1.getConstantValue(); - std::optional stride0 = strideInfo0.getConstantValue(); - std::optional stride1 = strideInfo1.getConstantValue(); - - AxisInfo::DimVectorT contiguity{ - shape0.has_value() && (stride0 == 1) ? shape0.value() : 1, - shape1.has_value() && (stride1 == 1) ? shape1.value() : 1}; - - int64_t ptrDivisibility = ptrInfo.getDivisibility()[0]; - int64_t strideDivisibility0 = strideInfo0.getDivisibility()[0]; - int64_t strideDivisibility1 = strideInfo1.getDivisibility()[0]; - - LDBG("ptrDivisibility: " << ptrDivisibility); - LDBG("strideDivisibility0: " << strideDivisibility0); - LDBG("strideDivisibility1: " << strideDivisibility1); - - AxisInfo::DimVectorT divisibility{1, 1}; - if (ptrDivisibility > 1) { - if (contiguity[0] > 1) - divisibility[0] = std::min(ptrDivisibility, strideDivisibility1); - if (contiguity[1] > 1) - divisibility[1] = std::min(ptrDivisibility, strideDivisibility0); + int64_t ptrDivisibility = ptrInfo.getDivisibility(0); + + AxisInfo::DimVectorT contiguity, constancy, divisibility; + for (int dim = 0; dim < rank; ++dim) { + contiguity.push_back( + strideInfo[dim].getConstantValue() == 1 ? blkShape[dim] : 1); + divisibility.push_back( + contiguity[dim] > 1 + ? std::min(ptrDivisibility, + strideInfo[dim == 0 ? 
1 : 0].getDivisibility()[0]) + : 1); + constancy.push_back(1); } - AxisInfo::DimVectorT constancy{1, 1}; - return AxisInfo(contiguity, divisibility, constancy); } }; +class AdvanceOpAxisInfoVisitor final + : public AxisInfoVisitorImpl { +public: + using AxisInfoVisitorImpl::AxisInfoVisitorImpl; + AxisInfo + getAxisInfo(triton::AdvanceOp op, + ArrayRef *> operands) override { + return operands[0]->getValue(); + } +}; + //===----------------------------------------------------------------------===// // AxisInfoAnalysis //===----------------------------------------------------------------------===// @@ -1099,6 +1099,7 @@ AxisInfoAnalysis::AxisInfoAnalysis(DataFlowSolver &solver) MaxMinOpAxisInfoVisitor>(); visitors.append(); visitors.append(); + visitors.append(); } LogicalResult AxisInfoAnalysis::visitOperation( diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp index b213bc4438..7f52090f4e 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp @@ -1,13 +1,16 @@ #include "intel/include/Analysis/AxisInfo.h" #include "intel/include/Dialect/TritonIntelGPU/IR/Utils.h" #include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h" -#include "mlir/Analysis/SliceAnalysis.h" +#include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "mlir/IR/Verifier.h" #include "mlir/Support/LLVM.h" #include "triton/Dialect/Triton/IR/Utility.h" -#include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" #include "triton/Tools/StrUtil.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "tritonintelgpu-coalesce" #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") @@ -20,18 +23,20 @@ namespace mlir::triton::gpu::intel { using namespace mlir; namespace tt = mlir::triton; +namespace ttg = mlir::triton::gpu; namespace ttgi = mlir::triton::gpu::intel; namespace { struct CoalescePass : public ttgi::impl::TritonIntelGPUCoalesceBase { +private: void setCoalescedEncoding(tt::intel::ModuleAxisInfoAnalysis &axisInfoAnalysis, Operation *op, int numWarps, int threadsPerWarp, llvm::MapVector &layoutMap) { Value ptr = getMemAccessPtr(op); - auto refTensorType = cast(ptr.getType()); + LDBG("ptr: " << ptr); LDBG("Considering op: " << *op); LLVM_DEBUG({ @@ -44,6 +49,7 @@ struct CoalescePass SmallVector order = argSort(contiguity); LDBG("order=[" << triton::join(order, ", ") << "]"); + RankedTensorType refTensorType = ttgi::getRankedTensorType(ptr.getType()); auto matchesShape = [&refTensorType](const Value &val) { auto rttType = dyn_cast(val.getType()); return rttType && rttType.getShape() == refTensorType.getShape(); @@ -67,12 +73,11 @@ struct CoalescePass } } - auto shapePerCTA = triton::gpu::getShapePerCTA(refTensorType); + auto shapePerCTA = ttg::getShapePerCTA(refTensorType); LDBG("shapePerCTA=[" << triton::join(shapePerCTA, ", ") << "]"); int numElems = product(shapePerCTA); int numThreads = numWarps * threadsPerWarp; - unsigned perThread = ttgi::getNumElementsPerThread(op, order, axisInfoAnalysis); LDBG("perThread for op: " << perThread); @@ -102,33 +107,207 @@ struct CoalescePass SmallVector sizePerThread(refTensorType.getRank(), 1); sizePerThread[order[0]] = perThread; - auto CTALayout = triton::gpu::getCTALayout(refTensorType.getEncoding()); - layoutMap[op] = 
triton::gpu::BlockedEncodingAttr::get( + auto CTALayout = ttg::getCTALayout(refTensorType.getEncoding()); + layoutMap[op] = ttg::BlockedEncodingAttr::get( &getContext(), refTensorType.getShape(), sizePerThread, order, numWarps, threadsPerWarp, CTALayout); } - static Type getNewType(Type type, Attribute encoding) { - RankedTensorType tensorType = cast(type); + static RankedTensorType getNewType(RankedTensorType tensorType, + Attribute encoding) { return RankedTensorType::get(tensorType.getShape(), tensorType.getElementType(), encoding); } + // Find the defining makeTensorPtrOp operation of the given value. + static std::optional + findDefiningMakeTensorPtrOp(Value val) { + if (auto arg = dyn_cast(val)) { + Operation *parentOp = val.getParentBlock()->getParentOp(); + assert(isa(parentOp) && "Expected a scf::ForOp"); + auto loopArg = + cast(parentOp).getInitArgs()[arg.getArgNumber() - 1]; + return findDefiningMakeTensorPtrOp(loopArg); + } + + if (auto advanceOp = val.getDefiningOp()) + return findDefiningMakeTensorPtrOp(advanceOp.getPtr()); + if (auto makePtrOp = val.getDefiningOp()) + return makePtrOp; + + return std::nullopt; + } + + static bool filterUser(Operation *op) { + // Yield operations trigger updating the layout of the containing loop + // results, don't skip them. + if (isa(op)) + return false; + + // Skip operations that don't yield a result and contain no regions. + if (op->getNumResults() == 0 && op->getNumRegions() == 0) + return true; + + // Operations that do not yield a block pointer aren't interesting. + if (op->getNumRegions() == 0 && + llvm::none_of(op->getResultTypes(), [](Type resType) { + return tt::isTensorPointerType(resType); + })) + return true; + + return false; + } + + // Change the \p layout of the \p op result and propagate the new result type + // to its users. + void changeAndPropagateLayout(Operation *op, Attribute layout, + IRRewriter &rewriter) const { + assert(op && op->getNumResults() == 1 && + "Expecting operation yielding a result"); + + rewriter.modifyOpInPlace(op, [&]() { + Value res = op->getOpResult(0); + assert(tt::isTensorPointerType(res.getType()) && + "Expecting a block pointer"); + + auto ptrType = cast(res.getType()); + auto tensorType = cast(ptrType.getPointeeType()); + res.setType(tt::PointerType::get(getNewType(tensorType, layout), + ptrType.getAddressSpace())); + }); + LDBG("Coalesced op: " << *op); + + propagateLayout(op, layout, rewriter); + } + + // Propagate the layout of the \p root operation's result to its users. + void propagateLayout(Operation *root, Attribute layout, + IRRewriter &rewriter) const { + assert(root->getNumResults() != 0 && + "Expecting an operation yielding a result"); + + LDBG("root: " << *root); + for (Operation *user : root->getUsers()) { + if (filterUser(user)) + continue; + + LDBG("root's user: " << *user << "\n"); + if (auto forOp = dyn_cast(user)) { + propagateLayoutToArgsAndBody(forOp, root, layout, rewriter); + continue; + } + if (auto yieldOp = dyn_cast(user)) { + auto forOp = yieldOp->getParentOfType(); + propagateLayoutToLoopResults(forOp, layout, rewriter); + continue; + } + changeAndPropagateLayout(user, layout, rewriter); + } + } + + // Propagate the layout of the \p arg block argument to its users. 
+ void propagateLayout(BlockArgument arg, Attribute layout, + IRRewriter &rewriter) const { + LDBG("arg: " << arg); + for (Operation *user : arg.getUsers()) { + if (filterUser(user)) + continue; + + LDBG("arg's user: " << *user << "\n"); + if (auto yieldOp = dyn_cast(user)) { + auto forOp = yieldOp->getParentOfType(); + propagateLayoutToLoopResults(forOp, layout, rewriter); + continue; + } + changeAndPropagateLayout(user, layout, rewriter); + } + } + + // Propagate the layout of the \p root operation's result to the \p forOp loop + // init argument that uses it, and transitively to the operations in the loop + // body that use that argument. + void propagateLayoutToArgsAndBody(scf::ForOp forOp, Operation *root, + Attribute layout, + IRRewriter &rewriter) const { + assert(llvm::any_of(root->getUsers(), + [&](Operation *user) { return user == forOp; }) && + "Expecting the loop to be a user of the root operation"); + + for (BlockArgument arg : forOp.getRegionIterArgs()) { + Value loopArg = forOp.getInitArgs()[arg.getArgNumber() - 1]; + for (OpResult res : root->getResults()) { + if (res != loopArg || !tt::isTensorPointerType(res.getType())) + continue; + + LDBG("loopArg: " << loopArg); + + // Modify the layout of the loop init argument... + tt::PointerType ptrType = cast(arg.getType()); + auto tensorType = cast(ptrType.getPointeeType()); + arg.setType(tt::PointerType::get(getNewType(tensorType, layout), + ptrType.getAddressSpace())); + + // ... and then propagate it to the operations in the loop. + propagateLayout(arg, layout, rewriter); + } + } + } + + // Modify the given loop \p forOp and propagate the result of the enclosing + // loop. + void propagateLayoutToLoopResults(scf::ForOp forOp, Attribute layout, + IRRewriter &rewriter) const { + Operation *yieldOp = forOp.getBody()->getTerminator(); + + rewriter.modifyOpInPlace(forOp, [&]() { + for (auto [opType, res] : + llvm::zip(yieldOp->getOperandTypes(), forOp.getResults())) { + if (opType == res.getType()) + continue; + + assert(tt::isTensorPointerType(res.getType()) && + tt::isTensorPointerType(opType) && "Expecting blocked pointers"); + assert(cast( + cast(opType).getPointeeType()) + .getEncoding() == layout && + "Unexpected layout"); + + auto resType = cast(res.getType()); + RankedTensorType tensorType = ttgi::getRankedTensorType(resType); + res.setType(tt::PointerType::get(getNewType(tensorType, layout), + resType.getAddressSpace())); + } + }); + + propagateLayout(forOp, layout, rewriter); + } + void coalesceOp(Attribute encoding, Operation *op) { + LDBG("Coalescing op: " << *op); + OpBuilder builder(op); + // Convert operands - // For load/store with tensor pointers, we don't have to change the - // operands' type, we do this by changing the outputs' type of - // `make_tensor_ptr` + // Note: for load/store with a blocked pointers argument we cannot change + // the operand type, instead we change the output type of + // `make_tensor_ptr` and propagate the new output type along the def-use + // chain. 
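+    // Illustrative shape of the rewritten def-use chain (the #coalesced
+    // layout name is a placeholder; see the coalesce.mlir tests above):
+    //   %p = tt.make_tensor_ptr ...       : <tensor<64x32xf16, #coalesced>>
+    //   %q = tt.advance %p, ...           : <tensor<64x32xf16, #coalesced>>
+    //   %v = tt.load %q                   : !tt.ptr<tensor<64x32xf16, #coalesced>>
+    //   %w = triton_gpu.convert_layout %v : ... -> original user layout
+    // Loop iter_args and results carrying the block pointer are retyped too.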
SmallVector newArgs; - for (auto operand : op->getOperands()) { + for (Value operand : op->getOperands()) { auto tensorType = dyn_cast(operand.getType()); if (tensorType && - !isa(tensorType.getEncoding())) { - Type newType = getNewType(tensorType, encoding); - newArgs.push_back(builder.create( + !isa(tensorType.getEncoding())) { + RankedTensorType newType = getNewType(tensorType, encoding); + newArgs.push_back(builder.create( op->getLoc(), newType, operand)); } else { + assert(isa(operand.getType()) && + "Expecting operand to have blocked pointer type"); + auto defOp = findDefiningMakeTensorPtrOp(operand); + assert(defOp && "Expected a make_tensor_ptr operation"); + LDBG("Found make_tensor_ptr definition: " << *defOp); + IRRewriter rewriter(builder); + changeAndPropagateLayout(*defOp, encoding, rewriter); newArgs.push_back(operand); } } @@ -136,27 +315,34 @@ struct CoalescePass // Convert output types SmallVector newTypes; for (auto t : op->getResultTypes()) { - bool isAsync = isa(op); - newTypes.push_back(isAsync ? t : getNewType(t, encoding)); + assert(!isa(op) && + "AsyncCopyGlobalToLocalOp not supported for Intel GPU"); + newTypes.push_back(getNewType(cast(t), encoding)); } - // Construct new op with the new encoding + // Construct new op with the new encoding. Operation *newOp = builder.create(op->getLoc(), op->getName().getIdentifier(), newArgs, newTypes, op->getAttrs()); - // Cast the results back to the original layout + // Cast the results back to the original layout. for (size_t i = 0; i < op->getNumResults(); i++) { Value newResult = newOp->getResult(i); if (newTypes[i] != op->getResultTypes()[i]) { - newResult = builder.create( + newResult = builder.create( op->getLoc(), op->getResult(i).getType(), newResult); } op->getResult(i).replaceAllUsesWith(newResult); } + + LDBG("Old op: " << *op); + LDBG("newOp: " << *newOp); op->erase(); + + assert(succeeded(verify(newOp)) && "Operation verification failed"); } +public: void runOnOperation() override { // Run axis info analysis ModuleOp moduleOp = getOperation(); @@ -169,20 +355,27 @@ struct CoalescePass Value ptr = getMemAccessPtr(curr); if (!ptr) return; - // We only convert `tensor>` load/store - bool isPtrTensor = false; - if (auto tensorType = dyn_cast(ptr.getType())) - isPtrTensor = isa(tensorType.getElementType()); - if (!isPtrTensor) + + RankedTensorType refTensorType = ttgi::getRankedTensorType(ptr.getType()); + if (!refTensorType || !refTensorType.getEncoding()) return; - auto mod = curr->getParentOfType(); - int numWarps = triton::gpu::TritonGPUDialect::getNumWarps(mod); - int threadsPerWarp = - triton::gpu::TritonGPUDialect::getThreadsPerWarp(mod); + + int numWarps = ttg::TritonGPUDialect::getNumWarps(moduleOp); + int threadsPerWarp = ttg::TritonGPUDialect::getThreadsPerWarp(moduleOp); setCoalescedEncoding(axisInfoAnalysis, curr, numWarps, threadsPerWarp, layoutMap); }); + LLVM_DEBUG({ + DBGS() << "\nlayoutMap:" + << "\n"; + for (auto [op, encoding] : layoutMap) { + DBGS() << "op: " << *op << "\n"; + DBGS() << "encoding: " << encoding << "\n\n"; + } + llvm::errs() << "\n\n"; + }); + // For each memory op that has a layout L1: // 1. Create a coalesced memory layout L2 of the pointer operands // 2. Convert all operands from layout L1 to layout L2 @@ -190,9 +383,11 @@ struct CoalescePass // produces a tensor with layout L2 // 4. Convert the output of this new memory op back to L1 // 5. 
Replace all the uses of the original memory op by the new one - for (auto &kv : layoutMap) { - coalesceOp(kv.second, kv.first); + for (auto [op, layout] : layoutMap) { + coalesceOp(layout, op); } + + assert(succeeded(verify(moduleOp)) && "Module verification failed"); } };
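
A standalone sketch, not part of the patch, of the per-dimension rule applied by the updated MakeTensorPtrOpAxisInfoVisitor in the AxisInfo.cpp hunk above: a dimension is contiguous over the whole block iff its stride is a known constant 1, and its divisibility then combines the base-pointer alignment with the alignment of the other dimension's stride (rank-2 case). The helper name, struct, and sample values below are illustrative only.

// Minimal sketch (illustrative; not part of the patch) of the rank-2
// make_tensor_ptr axis-info rule.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

struct BlockPtrAxisInfo {
  std::vector<int64_t> contiguity, divisibility;
};

static BlockPtrAxisInfo
makeTensorPtrAxisInfo(const std::vector<int64_t> &blockShape,
                      const std::vector<std::optional<int64_t>> &strideConst,
                      const std::vector<int64_t> &strideDivisibility,
                      int64_t ptrDivisibility) {
  BlockPtrAxisInfo info;
  for (size_t dim = 0; dim < blockShape.size(); ++dim) {
    // Contiguous across the whole block only when the stride is constant 1.
    int64_t contig = (strideConst[dim] == 1) ? blockShape[dim] : 1;
    // Alignment is limited by the base pointer and by the stride of the
    // other dimension; non-contiguous dimensions get divisibility 1.
    int64_t div =
        contig > 1
            ? std::min(ptrDivisibility, strideDivisibility[dim == 0 ? 1 : 0])
            : 1;
    info.contiguity.push_back(contig);
    info.divisibility.push_back(div);
  }
  return info;
}

int main() {
  // Block shape 128x64, strides [1, %s] with %s unknown but 16-divisible,
  // base pointer 16-divisible: dim 0 is fully contiguous and keeps the
  // min(16, 16) alignment, dim 1 is neither contiguous nor aligned.
  BlockPtrAxisInfo info =
      makeTensorPtrAxisInfo({128, 64}, {1, std::nullopt}, {1, 16}, 16);
  std::cout << info.contiguity[0] << " " << info.contiguity[1] << "\n";     // 128 1
  std::cout << info.divisibility[0] << " " << info.divisibility[1] << "\n"; // 16 1
}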