Skip to content

Commit 84f204b

Browse files
[RemoveLayoutConversion] Increase convert layout cost (#5477)
The cost of smem load/store and synchronization is higher on Intel GPUs compared to NV. This PR simply increases it by a factor of 2. Issue #5476 is created to further refine the remove layout conversion cost model. Benchmark CI: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/19356260840 (good) Fixes #5124 --------- Signed-off-by: Whitney Tsang <[email protected]>
1 parent 8a774bd commit 84f204b

File tree

2 files changed

+251
-1
lines changed
Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// RUN: triton-opt %s -tritonintelgpu-remove-layout-conversions 2>&1 | FileCheck %s

// The pass should rematerialize the #blocked1 loads/stores in #blocked so that
// no ttg.convert_layout (smem round-trip) survives in the output.
// CHECK-NOT: ttg.convert_layout
#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.support_sg_2d_block, ttig.target_arch = "spir64"} {
  tt.func public @triton_poi_fused_max_pool2d_with_indices_max_pool2d_with_indices_backward_139(%in_ptr0: !tt.ptr<i8> {tt.divisibility = 16 : i32}, %in_ptr1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %out_ptr0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %xnumel: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
    %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
    %cst_0 = arith.constant dense<294> : tensor<1024xi32, #blocked>
    %cst_1 = arith.constant dense<144> : tensor<1024xi32, #blocked>
    %cst_2 = arith.constant dense<3> : tensor<1024xi32, #blocked>
    %cst_3 = arith.constant dense<9> : tensor<1024xi32, #blocked>
    %cst_4 = arith.constant dense<341056> : tensor<1024xi32, #blocked>
    %cst_5 = arith.constant dense<4672> : tensor<1024xi32, #blocked>
    %cst_6 = arith.constant dense<73> : tensor<1024xi32, #blocked>
    %cst_7 = arith.constant dense<1> : tensor<1024xi32, #blocked>
    %cst_8 = arith.constant dense<2> : tensor<1024xi32, #blocked>
    %cst_9 = arith.constant dense<0> : tensor<1024xi32, #blocked>
    %cst_10 = arith.constant dense<-1> : tensor<1024xi32, #blocked>
    %cst_11 = arith.constant dense<21609> : tensor<1024xi32, #blocked>
    %cst_12 = arith.constant dense<1382976> : tensor<1024xi32, #blocked>
    %cst_13 = arith.constant dense<9408> : tensor<1024xi32, #blocked>
    %cst_14 = arith.constant dense<147> : tensor<1024xi32, #blocked>
    %cst_15 = arith.constant dense<64> : tensor<1024xi32, #blocked>
    %c1024_i32 = arith.constant 1024 : i32
    %0 = tt.get_program_id x : i32
    %1 = arith.muli %0, %c1024_i32 : i32
    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
    %3 = tt.splat %1 : i32 -> tensor<1024xi32, #blocked>
    %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
    %5 = arith.remsi %4, %cst_15 : tensor<1024xi32, #blocked>
    %6 = arith.divsi %4, %cst_15 : tensor<1024xi32, #blocked>
    %7 = arith.remsi %6, %cst_14 : tensor<1024xi32, #blocked>
    %8 = arith.divsi %4, %cst_13 : tensor<1024xi32, #blocked>
    %9 = arith.remsi %8, %cst_14 : tensor<1024xi32, #blocked>
    %10 = arith.divsi %4, %cst_12 : tensor<1024xi32, #blocked>
    %11 = arith.remsi %6, %cst_11 : tensor<1024xi32, #blocked>
    %12 = arith.addi %7, %cst_10 : tensor<1024xi32, #blocked>
    %13 = arith.divsi %12, %cst_8 : tensor<1024xi32, #blocked>
    %14 = arith.remsi %12, %cst_8 : tensor<1024xi32, #blocked>
    %15 = arith.cmpi ne, %14, %cst_9 : tensor<1024xi32, #blocked>
    %16 = arith.subi %13, %cst_7 : tensor<1024xi32, #blocked>
    %17 = arith.select %15, %16, %13 : tensor<1024xi1, #blocked>, tensor<1024xi32, #blocked>
    %18 = arith.cmpi slt, %12, %cst_9 : tensor<1024xi32, #blocked>
    %19 = arith.select %18, %17, %13 : tensor<1024xi1, #blocked>, tensor<1024xi32, #blocked>
    %20 = arith.cmpi sgt, %19, %cst_9 : tensor<1024xi32, #blocked>
    %21 = arith.extui %20 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %22 = arith.muli %19, %21 : tensor<1024xi32, #blocked>
    %23 = arith.divsi %7, %cst_8 : tensor<1024xi32, #blocked>
    %24 = arith.addi %23, %cst_7 : tensor<1024xi32, #blocked>
    %25 = arith.cmpi sge, %24, %cst_6 : tensor<1024xi32, #blocked>
    %26 = arith.extui %25 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %27 = arith.muli %26, %cst_6 : tensor<1024xi32, #blocked>
    %28 = arith.cmpi slt, %24, %cst_6 : tensor<1024xi32, #blocked>
    %29 = arith.extui %28 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %30 = arith.muli %24, %29 : tensor<1024xi32, #blocked>
    %31 = arith.addi %27, %30 : tensor<1024xi32, #blocked>
    %32 = arith.addi %31, %cst_10 : tensor<1024xi32, #blocked>
    %33 = arith.cmpi sle, %22, %32 : tensor<1024xi32, #blocked>
    %34 = arith.extui %33 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %35 = arith.muli %22, %34 : tensor<1024xi32, #blocked>
    %36 = arith.cmpi slt, %32, %22 : tensor<1024xi32, #blocked>
    %37 = arith.extui %36 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %38 = arith.muli %32, %37 : tensor<1024xi32, #blocked>
    %39 = arith.addi %35, %38 : tensor<1024xi32, #blocked>
    %40 = arith.muli %39, %cst_15 : tensor<1024xi32, #blocked>
    %41 = arith.addi %5, %40 : tensor<1024xi32, #blocked>
    %42 = arith.addi %9, %cst_10 : tensor<1024xi32, #blocked>
    %43 = arith.divsi %42, %cst_8 : tensor<1024xi32, #blocked>
    %44 = arith.remsi %42, %cst_8 : tensor<1024xi32, #blocked>
    %45 = arith.cmpi ne, %44, %cst_9 : tensor<1024xi32, #blocked>
    %46 = arith.subi %43, %cst_7 : tensor<1024xi32, #blocked>
    %47 = arith.select %45, %46, %43 : tensor<1024xi1, #blocked>, tensor<1024xi32, #blocked>
    %48 = arith.cmpi slt, %42, %cst_9 : tensor<1024xi32, #blocked>
    %49 = arith.select %48, %47, %43 : tensor<1024xi1, #blocked>, tensor<1024xi32, #blocked>
    %50 = arith.cmpi sgt, %49, %cst_9 : tensor<1024xi32, #blocked>
    %51 = arith.extui %50 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %52 = arith.muli %49, %51 : tensor<1024xi32, #blocked>
    %53 = arith.divsi %9, %cst_8 : tensor<1024xi32, #blocked>
    %54 = arith.addi %53, %cst_7 : tensor<1024xi32, #blocked>
    %55 = arith.cmpi sge, %54, %cst_6 : tensor<1024xi32, #blocked>
    %56 = arith.extui %55 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %57 = arith.muli %56, %cst_6 : tensor<1024xi32, #blocked>
    %58 = arith.cmpi slt, %54, %cst_6 : tensor<1024xi32, #blocked>
    %59 = arith.extui %58 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %60 = arith.muli %54, %59 : tensor<1024xi32, #blocked>
    %61 = arith.addi %57, %60 : tensor<1024xi32, #blocked>
    %62 = arith.addi %61, %cst_10 : tensor<1024xi32, #blocked>
    %63 = arith.cmpi sle, %52, %62 : tensor<1024xi32, #blocked>
    %64 = arith.extui %63 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %65 = arith.muli %52, %64 : tensor<1024xi32, #blocked>
    %66 = arith.cmpi slt, %62, %52 : tensor<1024xi32, #blocked>
    %67 = arith.extui %66 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %68 = arith.muli %62, %67 : tensor<1024xi32, #blocked>
    %69 = arith.addi %65, %68 : tensor<1024xi32, #blocked>
    %70 = arith.muli %69, %cst_5 : tensor<1024xi32, #blocked>
    %71 = arith.addi %41, %70 : tensor<1024xi32, #blocked>
    %72 = arith.muli %10, %cst_4 : tensor<1024xi32, #blocked>
    %73 = arith.addi %71, %72 : tensor<1024xi32, #blocked>
    %74 = tt.splat %in_ptr0 : !tt.ptr<i8> -> tensor<1024x!tt.ptr<i8>, #blocked>
    %75 = tt.addptr %74, %73 : tensor<1024x!tt.ptr<i8>, #blocked>, tensor<1024xi32, #blocked>
    %76 = ttg.convert_layout %75 : tensor<1024x!tt.ptr<i8>, #blocked> -> tensor<1024x!tt.ptr<i8>, #blocked1>
    %77 = tt.load %76 : tensor<1024x!tt.ptr<i8>, #blocked1>
    %78 = ttg.convert_layout %77 : tensor<1024xi8, #blocked1> -> tensor<1024xi8, #blocked>
    %79 = tt.splat %in_ptr1 : !tt.ptr<f16> -> tensor<1024x!tt.ptr<f16>, #blocked>
    %80 = tt.addptr %79, %73 : tensor<1024x!tt.ptr<f16>, #blocked>, tensor<1024xi32, #blocked>
    %81 = ttg.convert_layout %80 : tensor<1024x!tt.ptr<f16>, #blocked> -> tensor<1024x!tt.ptr<f16>, #blocked1>
    %82 = tt.load %81 : tensor<1024x!tt.ptr<f16>, #blocked1>
    %83 = ttg.convert_layout %82 : tensor<1024xf16, #blocked1> -> tensor<1024xf16, #blocked>
    %84 = arith.extf %83 : tensor<1024xf16, #blocked> to tensor<1024xf32, #blocked>
    %85 = arith.addi %22, %cst_7 : tensor<1024xi32, #blocked>
    %86 = arith.cmpi sle, %85, %32 : tensor<1024xi32, #blocked>
    %87 = arith.extui %86 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %88 = arith.muli %85, %87 : tensor<1024xi32, #blocked>
    %89 = arith.cmpi slt, %32, %85 : tensor<1024xi32, #blocked>
    %90 = arith.extui %89 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %91 = arith.muli %32, %90 : tensor<1024xi32, #blocked>
    %92 = arith.addi %88, %91 : tensor<1024xi32, #blocked>
    %93 = arith.muli %92, %cst_15 : tensor<1024xi32, #blocked>
    %94 = arith.addi %5, %93 : tensor<1024xi32, #blocked>
    %95 = arith.addi %94, %70 : tensor<1024xi32, #blocked>
    %96 = arith.addi %95, %72 : tensor<1024xi32, #blocked>
    %97 = tt.addptr %74, %96 : tensor<1024x!tt.ptr<i8>, #blocked>, tensor<1024xi32, #blocked>
    %98 = ttg.convert_layout %97 : tensor<1024x!tt.ptr<i8>, #blocked> -> tensor<1024x!tt.ptr<i8>, #blocked1>
    %99 = tt.load %98 : tensor<1024x!tt.ptr<i8>, #blocked1>
    %100 = ttg.convert_layout %99 : tensor<1024xi8, #blocked1> -> tensor<1024xi8, #blocked>
    %101 = tt.addptr %79, %96 : tensor<1024x!tt.ptr<f16>, #blocked>, tensor<1024xi32, #blocked>
    %102 = ttg.convert_layout %101 : tensor<1024x!tt.ptr<f16>, #blocked> -> tensor<1024x!tt.ptr<f16>, #blocked1>
    %103 = tt.load %102 : tensor<1024x!tt.ptr<f16>, #blocked1>
    %104 = ttg.convert_layout %103 : tensor<1024xf16, #blocked1> -> tensor<1024xf16, #blocked>
    %105 = arith.extf %104 : tensor<1024xf16, #blocked> to tensor<1024xf32, #blocked>
    %106 = arith.addi %52, %cst_7 : tensor<1024xi32, #blocked>
    %107 = arith.cmpi sle, %106, %62 : tensor<1024xi32, #blocked>
    %108 = arith.extui %107 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %109 = arith.muli %106, %108 : tensor<1024xi32, #blocked>
    %110 = arith.cmpi slt, %62, %106 : tensor<1024xi32, #blocked>
    %111 = arith.extui %110 : tensor<1024xi1, #blocked> to tensor<1024xi32, #blocked>
    %112 = arith.muli %62, %111 : tensor<1024xi32, #blocked>
    %113 = arith.addi %109, %112 : tensor<1024xi32, #blocked>
    %114 = arith.muli %113, %cst_5 : tensor<1024xi32, #blocked>
    %115 = arith.addi %41, %114 : tensor<1024xi32, #blocked>
    %116 = arith.addi %115, %72 : tensor<1024xi32, #blocked>
    %117 = tt.addptr %74, %116 : tensor<1024x!tt.ptr<i8>, #blocked>, tensor<1024xi32, #blocked>
    %118 = ttg.convert_layout %117 : tensor<1024x!tt.ptr<i8>, #blocked> -> tensor<1024x!tt.ptr<i8>, #blocked1>
    %119 = tt.load %118 : tensor<1024x!tt.ptr<i8>, #blocked1>
    %120 = ttg.convert_layout %119 : tensor<1024xi8, #blocked1> -> tensor<1024xi8, #blocked>
    %121 = tt.addptr %79, %116 : tensor<1024x!tt.ptr<f16>, #blocked>, tensor<1024xi32, #blocked>
    %122 = ttg.convert_layout %121 : tensor<1024x!tt.ptr<f16>, #blocked> -> tensor<1024x!tt.ptr<f16>, #blocked1>
    %123 = tt.load %122 : tensor<1024x!tt.ptr<f16>, #blocked1>
    %124 = ttg.convert_layout %123 : tensor<1024xf16, #blocked1> -> tensor<1024xf16, #blocked>
    %125 = arith.extf %124 : tensor<1024xf16, #blocked> to tensor<1024xf32, #blocked>
    %126 = arith.addi %94, %114 : tensor<1024xi32, #blocked>
    %127 = arith.addi %126, %72 : tensor<1024xi32, #blocked>
    %128 = tt.addptr %74, %127 : tensor<1024x!tt.ptr<i8>, #blocked>, tensor<1024xi32, #blocked>
    %129 = ttg.convert_layout %128 : tensor<1024x!tt.ptr<i8>, #blocked> -> tensor<1024x!tt.ptr<i8>, #blocked1>
    %130 = tt.load %129 : tensor<1024x!tt.ptr<i8>, #blocked1>
    %131 = ttg.convert_layout %130 : tensor<1024xi8, #blocked1> -> tensor<1024xi8, #blocked>
    %132 = tt.addptr %79, %127 : tensor<1024x!tt.ptr<f16>, #blocked>, tensor<1024xi32, #blocked>
    %133 = ttg.convert_layout %132 : tensor<1024x!tt.ptr<f16>, #blocked> -> tensor<1024x!tt.ptr<f16>, #blocked1>
    %134 = tt.load %133 : tensor<1024x!tt.ptr<f16>, #blocked1>
    %135 = ttg.convert_layout %134 : tensor<1024xf16, #blocked1> -> tensor<1024xf16, #blocked>
    %136 = arith.extf %135 : tensor<1024xf16, #blocked> to tensor<1024xf32, #blocked>
    %137 = arith.extsi %78 : tensor<1024xi8, #blocked> to tensor<1024xi32, #blocked>
    %138 = arith.addi %137, %cst_3 : tensor<1024xi32, #blocked>
    %139 = arith.cmpi slt, %137, %cst_9 : tensor<1024xi32, #blocked>
    %140 = arith.select %139, %138, %137 : tensor<1024xi1, #blocked>, tensor<1024xi32, #blocked>
    %141 = arith.cmpi sge, %140, %cst_9 : tensor<1024xi32, #blocked>
    %142 = arith.cmpi slt, %140, %cst_3 : tensor<1024xi32, #blocked>
    %143 = arith.andi %141, %142 : tensor<1024xi1, #blocked>
    tt.assert %143, "index out of bounds: 0 <= tmp4 < 9" : tensor<1024xi1, #blocked>
    %144 = arith.muli %39, %cst_8 : tensor<1024xi32, #blocked>
    %145 = arith.addi %140, %144 : tensor<1024xi32, #blocked>
    %146 = arith.divsi %140, %cst_2 : tensor<1024xi32, #blocked>
    %147 = arith.muli %146, %cst_1 : tensor<1024xi32, #blocked>
    %148 = arith.addi %145, %147 : tensor<1024xi32, #blocked>
    %149 = arith.muli %69, %cst_0 : tensor<1024xi32, #blocked>
    %150 = arith.addi %148, %149 : tensor<1024xi32, #blocked>
    %151 = arith.cmpi eq, %150, %11 : tensor<1024xi32, #blocked>
    %152 = arith.select %151, %84, %cst : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked>
    %153 = arith.extsi %100 : tensor<1024xi8, #blocked> to tensor<1024xi32, #blocked>
    %154 = arith.addi %153, %cst_3 : tensor<1024xi32, #blocked>
    %155 = arith.cmpi slt, %153, %cst_9 : tensor<1024xi32, #blocked>
    %156 = arith.select %155, %154, %153 : tensor<1024xi1, #blocked>, tensor<1024xi32, #blocked>
    %157 = arith.cmpi sge, %156, %cst_9 : tensor<1024xi32, #blocked>
    %158 = arith.cmpi slt, %156, %cst_3 : tensor<1024xi32, #blocked>
    %159 = arith.andi %157, %158 : tensor<1024xi1, #blocked>
    tt.assert %159, "index out of bounds: 0 <= tmp15 < 9" : tensor<1024xi1, #blocked>
    %160 = arith.muli %92, %cst_8 : tensor<1024xi32, #blocked>
    %161 = arith.addi %156, %160 : tensor<1024xi32, #blocked>
    %162 = arith.divsi %156, %cst_2 : tensor<1024xi32, #blocked>
    %163 = arith.muli %162, %cst_1 : tensor<1024xi32, #blocked>
    %164 = arith.addi %161, %163 : tensor<1024xi32, #blocked>
    %165 = arith.addi %164, %149 : tensor<1024xi32, #blocked>
    %166 = arith.cmpi eq, %165, %11 : tensor<1024xi32, #blocked>
    %167 = arith.cmpi slt, %52, %61 : tensor<1024xi32, #blocked>
    %168 = arith.cmpi slt, %85, %31 : tensor<1024xi32, #blocked>
    %169 = arith.andi %167, %168 : tensor<1024xi1, #blocked>
    %170 = arith.andi %169, %166 : tensor<1024xi1, #blocked>
    %171 = arith.addf %152, %105 : tensor<1024xf32, #blocked>
    %172 = arith.select %170, %171, %152 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked>
    %173 = arith.extsi %120 : tensor<1024xi8, #blocked> to tensor<1024xi32, #blocked>
    %174 = arith.addi %173, %cst_3 : tensor<1024xi32, #blocked>
    %175 = arith.cmpi slt, %173, %cst_9 : tensor<1024xi32, #blocked>
    %176 = arith.select %175, %174, %173 : tensor<1024xi1, #blocked>, tensor<1024xi32, #blocked>
    %177 = arith.cmpi sge, %176, %cst_9 : tensor<1024xi32, #blocked>
    %178 = arith.cmpi slt, %176, %cst_3 : tensor<1024xi32, #blocked>
    %179 = arith.andi %177, %178 : tensor<1024xi1, #blocked>
    tt.assert %179, "index out of bounds: 0 <= tmp33 < 9" : tensor<1024xi1, #blocked>
    %180 = arith.addi %176, %144 : tensor<1024xi32, #blocked>
    %181 = arith.divsi %176, %cst_2 : tensor<1024xi32, #blocked>
    %182 = arith.muli %181, %cst_1 : tensor<1024xi32, #blocked>
    %183 = arith.addi %180, %182 : tensor<1024xi32, #blocked>
    %184 = arith.muli %113, %cst_0 : tensor<1024xi32, #blocked>
    %185 = arith.addi %183, %184 : tensor<1024xi32, #blocked>
    %186 = arith.cmpi eq, %185, %11 : tensor<1024xi32, #blocked>
    %187 = arith.cmpi slt, %106, %61 : tensor<1024xi32, #blocked>
    %188 = arith.cmpi slt, %22, %31 : tensor<1024xi32, #blocked>
    %189 = arith.andi %187, %188 : tensor<1024xi1, #blocked>
    %190 = arith.andi %189, %186 : tensor<1024xi1, #blocked>
    %191 = arith.addf %172, %125 : tensor<1024xf32, #blocked>
    %192 = arith.select %190, %191, %172 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked>
    %193 = arith.extsi %131 : tensor<1024xi8, #blocked> to tensor<1024xi32, #blocked>
    %194 = arith.addi %193, %cst_3 : tensor<1024xi32, #blocked>
    %195 = arith.cmpi slt, %193, %cst_9 : tensor<1024xi32, #blocked>
    %196 = arith.select %195, %194, %193 : tensor<1024xi1, #blocked>, tensor<1024xi32, #blocked>
    %197 = arith.cmpi sge, %196, %cst_9 : tensor<1024xi32, #blocked>
    %198 = arith.cmpi slt, %196, %cst_3 : tensor<1024xi32, #blocked>
    %199 = arith.andi %197, %198 : tensor<1024xi1, #blocked>
    tt.assert %199, "index out of bounds: 0 <= tmp49 < 9" : tensor<1024xi1, #blocked>
    %200 = arith.addi %196, %160 : tensor<1024xi32, #blocked>
    %201 = arith.divsi %196, %cst_2 : tensor<1024xi32, #blocked>
    %202 = arith.muli %201, %cst_1 : tensor<1024xi32, #blocked>
    %203 = arith.addi %200, %202 : tensor<1024xi32, #blocked>
    %204 = arith.addi %203, %184 : tensor<1024xi32, #blocked>
    %205 = arith.cmpi eq, %204, %11 : tensor<1024xi32, #blocked>
    %206 = arith.andi %187, %168 : tensor<1024xi1, #blocked>
    %207 = arith.andi %206, %205 : tensor<1024xi1, #blocked>
    %208 = arith.addf %192, %136 : tensor<1024xf32, #blocked>
    %209 = arith.select %207, %208, %192 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked>
    %210 = tt.splat %out_ptr0 : !tt.ptr<f16> -> tensor<1024x!tt.ptr<f16>, #blocked>
    %211 = tt.addptr %210, %4 : tensor<1024x!tt.ptr<f16>, #blocked>, tensor<1024xi32, #blocked>
    %212 = arith.truncf %209 : tensor<1024xf32, #blocked> to tensor<1024xf16, #blocked>
    %213 = ttg.convert_layout %211 : tensor<1024x!tt.ptr<f16>, #blocked> -> tensor<1024x!tt.ptr<f16>, #blocked1>
    %214 = ttg.convert_layout %212 : tensor<1024xf16, #blocked> -> tensor<1024xf16, #blocked1>
    tt.store %213, %214 : tensor<1024x!tt.ptr<f16>, #blocked1>
    tt.return
  }
}

third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1560,7 +1560,9 @@ void LayoutRematerialization::backwardRematerialization(
15601560
// We measure costs in standardised milli-SM-cycles. The smem load
15611561
// and store each cost 8 * convertLayoutBytes, and then we double
15621562
// it to account for extra cost due to synchronisation.
1563-
int64_t convertLayoutCost = 32 * convertLayoutBytes;
1563+
// FIXME: measure cost of smem load/store and synchronisation on Intel GPUs,
1564+
// and refine this model further. (#5476)
1565+
int64_t convertLayoutCost = 32 * convertLayoutBytes * 2;
15641566
int64_t rematerialisationCost = 0;
15651567

15661568
// Evaluate single-use status for every operation in slice

0 commit comments

Comments
 (0)