Update getLaneAndWarpId and getThreadId (#4715)

etiotto · web-flow · commit 877cf7a9751c · 2025-07-16T10:08:11.000-04:00
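Update `getThreadId` to mask the thread ID with
`numWarps * threadsPerWarp - 1` when that bound is a power of two (this
helps LLVM's known bits analysis), update `getLaneAndWarpId` to return
the thread ID directly as the lane ID when there is only one warp, and
make `getLaneId` use `getLaneAndWarpId`.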

---------

Signed-off-by: Tiotto, Ettore <ettore.tiotto@intel.com>
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -298,42 +298,53 @@ Value getThreadId(OpBuilder &rewriter, Location loc) {
       rewriter.create<::mlir::gpu::ThreadIdOp>(loc, ::mlir::gpu::Dimension::x);
   tid = rewriter.create<arith::IndexCastOp>(loc, i32_ty, tid);
 
+  Operation *lookupPt = &rewriter.getInsertionBlock()->front();
+  int threadsPerWarp = triton::gpu::lookupThreadsPerWarp(rewriter);
+  int numWarps = triton::gpu::lookupNumWarps(lookupPt);
+  int upperBound = numWarps * threadsPerWarp;
+
+  TritonLLVMOpBuilder b(loc, rewriter);
+
   // If this is being created inside a warp specialize op, compute the relative
   // thread ID within the warp group.
   if (std::optional<int> startId =
           getWarpGroupStartThreadId(rewriter.getInsertionBlock())) {
-    TritonLLVMOpBuilder b(loc, rewriter);
     tid = rewriter.create<arith::SubIOp>(loc, tid, b.i32_val(*startId));
   }
 
-  return tid;
-}
+  if (llvm::isPowerOf2_32(upperBound)) {
+    // help LLVM's known bits analysis:
+    tid = b.and_(tid, b.i32_val(upperBound - 1));
+  }
 
-Value getLaneId(OpBuilder &rewriter, Location loc) {
-  TritonLLVMOpBuilder b(loc, rewriter);
-  Value tid = getThreadId(rewriter, loc);
-  int threadsPerWarp = triton::gpu::lookupThreadsPerWarp(rewriter);
-  return b.urem(tid, b.i32_val(threadsPerWarp));
+  return tid;
 }
 
 std::pair<Value, Value> getLaneAndWarpId(OpBuilder &rewriter, Location loc) {
   TritonLLVMOpBuilder b(loc, rewriter);
   Value tid = getThreadId(rewriter, loc);
   int threadsPerWarp = triton::gpu::lookupThreadsPerWarp(rewriter);
   Value warpSizeVal = b.i32_val(threadsPerWarp);
-  Value laneId = b.urem(tid, warpSizeVal);
 
   // If there is only one warp, the warp ID is always 0.
   Operation *lookupPt = &rewriter.getInsertionBlock()->front();
+  Value laneId;
   Value warpId;
-  if (triton::gpu::lookupNumWarps(lookupPt) == 1)
+  if (triton::gpu::lookupNumWarps(lookupPt) == 1) {
+    laneId = tid;
     warpId = b.i32_val(0);
-  else
+  } else {
+    laneId = b.urem(tid, warpSizeVal);
     warpId = b.udiv(tid, warpSizeVal);
+  }
 
   return {laneId, warpId};
 }
 
+Value getLaneId(OpBuilder &rewriter, Location loc) {
+  return getLaneAndWarpId(rewriter, loc).first;
+}
+
 // Helper function: applies linear layout vectorized over register indices
 SmallVector<SmallVector<std::pair<StringAttr, Value>>>
 applyLinearLayoutVec(Location loc, RewriterBase &rewriter,
diff --git a/test/Conversion/amd/buffer_load_to_local_to_llvm.mlir b/test/Conversion/amd/buffer_load_to_local_to_llvm.mlir
@@ -147,6 +147,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
     // The first constant 0 skips the LDS offset which is also 0
     // COMMON: llvm.getelementptr
     // COMMON: llvm.mlir.constant(0 : i32) : i32
+    // COMMON: llvm.mlir.constant(0 : i32) : i32
     // COMMON: %[[aux_ca:.*]] = llvm.mlir.constant(0 : i32) : i32
     // COMMON: rocdl.raw.ptr.buffer.load.lds {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[aux_ca]]
     %1 = amdgpu.buffer_load_to_local %arg0[%0] cacheModifier = ca into %arg2: <f32>[tensor<64xi32, #blocked>] -> <64xf32, #shared, #smem, mutable>
diff --git a/test/Conversion/amd/mfma-shortcut.mlir b/test/Conversion/amd/mfma-shortcut.mlir
@@ -47,7 +47,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     // GFX942-DAG: [[c64:%.*]] = llvm.mlir.constant(64 : i32)
 
     // GFX942: [[threadId:%.*]] = rocdl.workitem.id.x
-    // GFX942: [[laneId:%.*]] = llvm.urem [[threadId]], [[c64]]
+    // GFX942: [[c255:%.*]] = llvm.mlir.constant(255 : i32)
+    // GFX942: [[RTID:%.*]] = llvm.and [[threadId]], [[c255]]
+    // GFX942: [[laneId:%.*]] = llvm.urem [[RTID]], [[c64]]
     // GFX942: [[mask0:%.*]] = llvm.icmp "slt" [[laneId]], [[c32]]
 
     // GFX942: [[shflLaneId:%.*]] = llvm.add [[laneId]], [[c32]]
@@ -128,7 +130,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     // GFX942-DAG: [[c64:%.*]] = llvm.mlir.constant(64 : i32)
 
     // GFX942: [[threadId:%.*]] = rocdl.workitem.id.x
-    // GFX942: [[laneId:%.*]] = llvm.urem [[threadId]], [[c64]]
+    // GFX942: [[c255:%.*]] = llvm.mlir.constant(255 : i32)
+    // GFX942: [[RTID:%.*]] = llvm.and [[threadId]], [[c255]]
+    // GFX942: [[laneId:%.*]] = llvm.urem [[RTID]], [[c64]]
     // GFX942: [[mask0:%.*]] = llvm.icmp "slt" [[laneId]], [[c32]]
 
     // GFX942: [[laneIdRem:%.*]] = llvm.urem [[laneId]], [[c32]]
diff --git a/test/Conversion/intel/dot_layout_offset.mlir b/test/Conversion/intel/dot_layout_offset.mlir
@@ -11,9 +11,11 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
     // COM: Base index of the dot layout.
     // CHECK:           %[[THREAD_ID_I64:.*]] = llvm.call spir_funccc @_Z12get_local_idj
     // CHECK:           %[[THREAD_ID_I32:.*]] = llvm.trunc %[[THREAD_ID_I64]] : i64 to i32
+    // CHECK:           %[[CST_63:.*]] = llvm.mlir.constant(63 : i32) : i32
+    // CHECK:           %[[RTID:.*]] = llvm.and %[[THREAD_ID_I32]], %[[CST_63]] : i32
     // CHECK:           %[[VAL_145:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[LANE_ID:.*]] = llvm.urem %[[THREAD_ID_I32]], %[[VAL_145]]  : i32
-    // CHECK:           %[[WARP_ID:.*]] = llvm.udiv %[[THREAD_ID_I32]], %[[VAL_145]]  : i32
+    // CHECK:           %[[LANE_ID:.*]] = llvm.urem %[[RTID]], %[[VAL_145]]  : i32
+    // CHECK:           %[[WARP_ID:.*]] = llvm.udiv %[[RTID]], %[[VAL_145]]  : i32
     // CHECK-COUNT-4:   %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK:           %[[VAL_149:.*]] = llvm.mlir.constant(1 : i32) : i32
     // CHECK:           %[[VAL_150:.*]] = llvm.and %[[LANE_ID]], %[[VAL_149]]  : i32
@@ -333,9 +335,11 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32, "ttg.thr
     // COM: Base index of the dot layout.
     // CHECK:           %[[THREAD_ID_I64:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[VAL_142]])
     // CHECK:           %[[THREAD_ID_I32:.*]] = llvm.trunc %[[THREAD_ID_I64]] : i64 to i32
-    // CHECK:           %[[VAL_145:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[LANE_ID:.*]] = llvm.urem %[[THREAD_ID_I32]], %[[VAL_145]]  : i32
-    // CHECK:           %[[WARP_ID:.*]] = llvm.udiv %[[THREAD_ID_I32]], %[[VAL_145]]  : i32
+    // CHECK-DAG:       %[[CST_63:.*]] = llvm.mlir.constant(63 : i32) : i32
+    // CHECK-DAG:       %[[RTID:.*]] = llvm.and %[[THREAD_ID_32:.*]], %[[CST_63]] : i32
+    // CHECK-DAG:       %[[VAL_145:.*]] = llvm.mlir.constant(16 : i32) : i32
+    // CHECK-DAG:       %[[LANE_ID:.*]] = llvm.urem %[[RTID]], %[[VAL_145]]  : i32
+    // CHECK-DAG:       %[[WARP_ID:.*]] = llvm.udiv %[[RTID]], %[[VAL_145]]  : i32
     // CHECK-COUNT-4:   %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK:           %[[VAL_149:.*]] = llvm.mlir.constant(1 : i32) : i32
     // CHECK:           %[[VAL_150:.*]] = llvm.and %[[LANE_ID]], %[[VAL_149]]  : i32
diff --git a/test/Conversion/intel/dpas_to_block_layout_convert.mlir b/test/Conversion/intel/dpas_to_block_layout_convert.mlir
@@ -10,24 +10,26 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.sha
   tt.func public @convert_dpas(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf16, #mma>
 
-    // CHECK-DAG:           %[[CST_3:.*]] = llvm.mlir.constant(3 : i32) : i32
-    // CHECK-DAG:           %[[CST_16384:.*]] = llvm.mlir.constant(16384 : i32) : i32
-    // CHECK-DAG:           %[[CST_8192:.*]] = llvm.mlir.constant(8192 : i32) : i32
-    // CHECK-DAG:           %[[CST_128:.*]] = llvm.mlir.constant(128 : i32) : i32
-    // CHECK-DAG:           %[[CST_64:.*]] = llvm.mlir.constant(64 : i32) : i32
-    // CHECK-DAG:           %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32
-    // CHECK-DAG:           %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32
-    // CHECK-DAG:           %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32
-    // CHECK-DAG:           %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-DAG:           %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK-DAG:           %[[SMEM:.*]] = llvm.mlir.addressof @global_smem : !llvm.ptr<3>
-    // CHECK-DAG:           %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK-DAG:           %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK-DAG:       %[[CST_3:.*]] = llvm.mlir.constant(3 : i32) : i32
+    // CHECK-DAG:       %[[CST_16384:.*]] = llvm.mlir.constant(16384 : i32) : i32
+    // CHECK-DAG:       %[[CST_8192:.*]] = llvm.mlir.constant(8192 : i32) : i32
+    // CHECK-DAG:       %[[CST_128:.*]] = llvm.mlir.constant(128 : i32) : i32
+    // CHECK-DAG:       %[[CST_64:.*]] = llvm.mlir.constant(64 : i32) : i32
+    // CHECK-DAG:       %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32
+    // CHECK-DAG:       %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32
+    // CHECK-DAG:       %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32
+    // CHECK-DAG:       %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32
+    // CHECK-DAG:       %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32
+    // CHECK-DAG:       %[[SMEM:.*]] = llvm.mlir.addressof @global_smem : !llvm.ptr<3>
+    // CHECK-DAG:       %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
+    // CHECK-DAG:       %[[CST_511:.*]] = llvm.mlir.constant(511 : i32) : i32
+    // CHECK-DAG:       %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
     // COM: The following operations is generated for the conversion of DPAS layout to blocked layout.  The conversion replica size is 128*256. So there is 1 round of load/store with synchronization.
     // CHECK:           %[[threadId_64:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[CST_0]]) {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return} : (i32) -> i64
     // CHECK:           %[[threadId:.*]] = llvm.trunc %[[threadId_64]] : i64 to i32
-    // CHECK:           %[[laneId:.*]] = llvm.urem %[[threadId]], %[[CST_16]]  : i32
-    // CHECK:           %[[warpId:.*]] = llvm.udiv %[[threadId]], %[[CST_16]]  : i32
+    // CHECK:           %[[rtid:.*]] = llvm.and %[[threadId:.*]], %[[CST_511]] : i32
+    // CHECK:           %[[laneId:.*]] = llvm.urem %[[rtid]], %[[CST_16]]  : i32
+    // CHECK:           %[[warpId:.*]] = llvm.udiv %[[rtid]], %[[CST_16]]  : i32
     // CHECK:           %[[VAL_25:.*]] = llvm.and %[[laneId]], %[[CST_1]] : i32
     // CHECK:           %[[VAL_26:.*]] = llvm.icmp "eq" %[[VAL_25]], %[[CST_0]] : i32
     // CHECK:           %[[VAL_27:.*]] = llvm.select %[[VAL_26]], %[[CST_0]], %[[CST_1]] : i1, i32
@@ -115,12 +117,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.sha
     // CHECK-DAG:           %[[SMEM:.*]] = llvm.mlir.addressof @global_smem : !llvm.ptr<3>
     // CHECK-DAG:           %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
     // CHECK-DAG:           %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK-DAG:           %[[CST_511:.*]] = llvm.mlir.constant(511 : i32) : i32
 
     // COM: The following operations is generated for the conversion of DPAS layout to blocked layout. The conversion replica size is 64*256. So there are 2 round of load/store with synchronization.
     // CHECK:           %[[threadId_64:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[CST_0]]) {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return} : (i32) -> i64
     // CHECK:           %[[threadId:.*]] = llvm.trunc %[[threadId_64]] : i64 to i32
-    // CHECK:           %[[laneId:.*]] = llvm.urem %[[threadId]], %[[CST_16]]  : i32
-    // CHECK:           %[[warpId:.*]] = llvm.udiv %[[threadId]], %[[CST_16]]  : i32
+    // CHECK:           %[[rtid:.*]] = llvm.and %[[threadId]], %[[CST_511]] : i32
+    // CHECK:           %[[laneId:.*]] = llvm.urem %[[rtid]], %[[CST_16]]  : i32
+    // CHECK:           %[[warpId:.*]] = llvm.udiv %[[rtid]], %[[CST_16]]  : i32
     // CHECK:           %[[VAL_25:.*]] = llvm.and %[[laneId]], %[[CST_1]] : i32
     // CHECK:           %[[VAL_26:.*]] = llvm.icmp "eq" %[[VAL_25]], %[[CST_0]] : i32
     // CHECK:           %[[VAL_27:.*]] = llvm.select %[[VAL_26]], %[[CST_0]], %[[CST_1]] : i1, i32
diff --git a/test/Conversion/intel/tritongpu_to_gen.mlir b/test/Conversion/intel/tritongpu_to_gen.mlir
@@ -1176,7 +1176,8 @@ module attributes {"ttg.target" = "xpu", "ttg.num-ctas" = 1 : i32, "ttg.num-warp
   // CHECK-LABEL: atomic_add_f32_scalar_no_store
   tt.func @atomic_add_f32_scalar_no_store(%arg0 : !tt.ptr<f32>, %arg1 : i1, %arg2 : f32) {
     // CHECK:      [[ZERO0:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:      [[MASKLANE:%.*]] = llvm.and
+    // CHECK:      [[CST_NEG_ONE:%.*]] = llvm.mlir.constant(-1 : i32) : i32
+    // CHECK-NEXT: [[MASKLANE:%.*]] = llvm.and
     // CHECK-NEXT: [[CMPLANE:%.*]] = llvm.icmp "eq" [[MASKLANE]], [[ZERO0]]
     // CHECK:      [[MASKWARP:%.*]] = llvm.and
     // CHECK-NEXT: [[CMPWARP:%.*]] = llvm.icmp "eq" [[MASKWARP]], [[ZERO0]]
@@ -1212,7 +1213,8 @@ module attributes {"ttg.target" = "xpu", "ttg.num-ctas" = 1 : i32, "ttg.num-warp
   // CHECK-LABEL: atomic_add_f32_scalar
   tt.func @atomic_add_f32_scalar(%arg0 : !tt.ptr<f32>, %arg1 : i1, %arg2 : f32) {
     // CHECK:      [[ZERO0:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:      [[MASKLANE:%.*]] = llvm.and
+    // CHECK:      [[CST_NEG_ONE:%.*]] = llvm.mlir.constant(-1 : i32) : i32
+    // CHECK-NEXT: [[MASKLANE:%.*]] = llvm.and
     // CHECK-NEXT: [[CMPLANE:%.*]] = llvm.icmp "eq" [[MASKLANE]], [[ZERO0]]
     // CHECK:      [[MASKWARP:%.*]] = llvm.and
     // CHECK-NEXT: [[CMPWARP:%.*]] = llvm.icmp "eq" [[MASKWARP]], [[ZERO0]]
diff --git a/test/Conversion/tritonnvidiagpu_to_llvm.mlir b/test/Conversion/tritonnvidiagpu_to_llvm.mlir
@@ -28,8 +28,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: arrive_barrier
   tt.func @arrive_barrier(%alloc: !ttg.memdesc<1xi64, #shared0, #smem>) {
     // CHECK-NEXT: [[TID:%.*]] = nvvm.read.ptx.sreg.tid.x
+    // CHECK-NEXT: [[C127:%.*]] = llvm.mlir.constant(127 : i32)
+    // CHECK-NEXT: [[RTID:%.*]] = llvm.and [[TID]], [[C127]]
     // CHECK-NEXT: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
-    // CHECK-NEXT: [[IS_ZERO:%.*]] = llvm.icmp "eq" [[TID]], [[C0]]
+    // CHECK-NEXT: [[IS_ZERO:%.*]] = llvm.icmp "eq" [[RTID]], [[C0]]
     // CHECK-NEXT: "@$0 mbarrier.arrive.shared::cta.b64 _, [$1], 2;", "b,r" [[IS_ZERO]], %arg0
     ttng.arrive_barrier %alloc, 2 : !ttg.memdesc<1xi64, #shared0, #smem>
     tt.return
@@ -38,8 +40,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: arrive_barrier_pred
   tt.func @arrive_barrier_pred(%alloc: !ttg.memdesc<1xi64, #shared0, #smem>, %pred: i1) {
     // CHECK-NEXT: [[TID:%.*]] = nvvm.read.ptx.sreg.tid.x
+    // CHECK-NEXT: [[C127:%.*]] = llvm.mlir.constant(127 : i32)
+    // CHECK-NEXT: [[RTID:%.*]] = llvm.and [[TID]], [[C127]]
     // CHECK-NEXT: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
-    // CHECK-NEXT: [[IS_ZERO:%.*]] = llvm.icmp "eq" [[TID]], [[C0]]
+    // CHECK-NEXT: [[IS_ZERO:%.*]] = llvm.icmp "eq" [[RTID]], [[C0]]
     // CHECK-NEXT: [[PRED:%.*]] = llvm.and [[IS_ZERO]], %arg1
     // CHECK-NEXT: "@$0 mbarrier.arrive.shared::cta.b64 _, [$1], 2;", "b,r" [[PRED]], %arg0
     ttng.arrive_barrier %alloc, 2, %pred : !ttg.memdesc<1xi64, #shared0, #smem>
diff --git a/test/TritonIntelGPU/blockptr_store.mlir b/test/TritonIntelGPU/blockptr_store.mlir
@@ -244,10 +244,12 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
       %0 = tt.make_tensor_ptr %arg0, [%c64_i64, %c64_i64], [%c1_i64, %col_stride], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x16xf16, #blocked>>
       // CHECK: llvm.call spir_funccc @_Z12get_local_idj
       // CHECK-NOT: llvm.icmp "slt"
-      // CHECK: %[[threadID:.*]] = llvm.call spir_funccc @_Z12get_local_idj
-      // CHECK: %[[VAL_583:.*]] = llvm.trunc %[[threadID]] : i64 to i32
-      // CHECK: %[[VAL_584:.*]] = llvm.mlir.constant(16 : i32) : i32
-      // CHECK: %[[VAL_586:.*]] = llvm.udiv %[[VAL_583]], %[[VAL_584]] : i32
+      // CHECK: %[[THREAD_ID:.*]] = llvm.call spir_funccc @_Z12get_local_idj
+      // CHECK: %[[THREAD_ID_32:.*]] = llvm.trunc %[[THREAD_ID]] : i64 to i32
+      // CHECK-DAG: %[[CST_127:.*]] = llvm.mlir.constant(127 : i32) : i32
+      // CHECK-DAG: %[[RTID:.*]] = llvm.and %[[THREAD_ID_32:.*]], %[[CST_127]] : i32
+      // CHECK-DAG: %[[VAL_584:.*]] = llvm.mlir.constant(16 : i32) : i32
+      // CHECK: %[[VAL_586:.*]] = llvm.udiv %[[RTID]], %[[VAL_584]] : i32
       // CHECK: %[[VAL_587:.*]] = llvm.mlir.constant(3 : i32) : i32
       // CHECK: %[[VAL_588:.*]] = llvm.and %[[VAL_586]], %[[VAL_587]] : i32
       // CHECK: %[[threadPred:.*]] = llvm.icmp "eq" %[[VAL_588]], {{.*}} : i32
diff --git a/test/TritonIntelGPU/tensor-pointer-load-block-2d.mlir b/test/TritonIntelGPU/tensor-pointer-load-block-2d.mlir
@@ -323,8 +323,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
     // CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK: [[THREADID_i64:%.*]] = llvm.call spir_funccc @_Z12get_local_idj([[C0]])
     // CHECK: [[THREADID:%.*]] = llvm.trunc [[THREADID_i64]] : i64 to i32
+    // CHECK: [[C127:%.*]] = llvm.mlir.constant(127 : i32) : i32
+    // CHECK: [[RTID:%.*]] = llvm.and [[THREADID]], [[C127]] : i32
     // CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : i32) : i32
-    // CHECK: [[REM:%.*]] = llvm.urem [[THREADID]], [[C8]] : i32
+    // CHECK: [[REM:%.*]] = llvm.urem [[RTID]], [[C8]] : i32
+
     // CHECK: [[NEWVAL:%.*]] = llvm.call spir_funccc @_Z17sub_group_shuffleij([[OLDVAL]], [[REM]])
     // CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK: [[VEC1:%.*]] = llvm.insertelement [[NEWVAL]], [[VEC]][[[C0]] : i32] : vector<2xi32>
@@ -334,8 +337,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
     // CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK: [[THREADID_i64:%.*]] = llvm.call spir_funccc @_Z12get_local_idj([[C0]])
     // CHECK: [[THREADID:%.*]] = llvm.trunc [[THREADID_i64]] : i64 to i32
+    // CHECK: [[C127:%.*]] = llvm.mlir.constant(127 : i32) : i32
+    // CHECK: [[RTID:%.*]] = llvm.and [[THREADID]], [[C127]] : i32
     // CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : i32) : i32
-    // CHECK: [[REM:%.*]] = llvm.urem [[THREADID]], [[C8]] : i32
+    // CHECK: [[REM:%.*]] = llvm.urem [[RTID]], [[C8]] : i32
     // CHECK: [[NEWVAL:%.*]] = llvm.call spir_funccc @_Z17sub_group_shuffleij([[OLDVAL]], [[REM]])
     // CHECK: [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
     // CHECK: [[VEC2:%.*]] = llvm.insertelement [[NEWVAL]], [[VEC1]][[[C1]] : i32] : vector<2xi32>
diff --git a/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir b/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir