@@ -207,8 +207,7 @@ Value AtomicRMWEmitter::emitAtomicRMW(RewriterBase &rewriter, Value rmwPtr,
 
 Value AtomicRMWEmitter::emitPairedAtomicForEvenTID(RewriterBase &rewriter,
                                                    Value rmwPtr, Value valElem,
-                                                   Value rmwMask,
-                                                   bool checkPairs) const {
+                                                   Value rmwMask) const {
   auto loc = rmwPtr.getLoc();
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   Value i64Ones = b.i64_val(~uint64_t(0));
@@ -231,44 +230,34 @@ Value AtomicRMWEmitter::emitPairedAtomicForEvenTID(RewriterBase &rewriter,
   Value dppMoveRes = shiftLeftI32ByDpp(rewriter, packedVal);
   Value operand = b.bitcast(b.or_(packedVal, dppMoveRes), packF16Ty);
 
-  // If a runtime check is unnecessary (`checkPairs` is `false`),
-  // `rightNeighbourPtr` is irrelevant.
-  // Set the conditional value `enablePackedOpt` to `true` to enable DCE on the
-  // runtime check branch.
-  Value rightNeighbourPtr = rmwPtr;
-  Value enablePackedOpt = b.true_val();
-  if (checkPairs) {
-    Value rightNeighbourAddr =
-        genI32TiledOp(rewriter, shiftLeftI32ByDpp, castedAddr);
-
-    // Packing optimization only supported if following conditions are true:
-    // 1. address is aligned by 4 bytes
-    // 2. right neighbour has adjacent address
-    // 3. both threads are active
-    Value isAligned = b.icmp_eq(b.urem(castedAddr, b.i64_val(4)), b.i64_val(0));
-    Value neighbourAddrAdjacent = b.icmp_eq(
-        rightNeighbourAddr,
-        b.add(castedAddr, b.i64_val(valueElemTy.getIntOrFloatBitWidth() / 8)));
-    Value neighbourEnabled = b.icmp_ne(i64Ones, rightNeighbourAddr);
-    Value bothEnabled = b.and_(neighbourEnabled, rmwMask);
-    enablePackedOpt =
-        b.and_(b.and_(isAligned, bothEnabled), neighbourAddrAdjacent);
-
-    // Enable only the even threads.
-    Value anyEnabled = b.or_(neighbourEnabled, rmwMask);
-    // If one of the threads is disabled, use the neighbour's addr.
-    rightNeighbourAddr =
-        b.select(neighbourEnabled, rightNeighbourAddr, castedAddr);
-    castedAddr = b.select(rmwMask, castedAddr, rightNeighbourAddr);
-
-    rmwMask = b.and_(anyEnabled, b.icmp_eq(isOddI32, b.i32_val(0)));
-
-    // Unpack results back
-    rightNeighbourPtr = b.inttoptr(rmwPtr.getType(), rightNeighbourAddr);
-    rmwPtr = b.inttoptr(rmwPtr.getType(), castedAddr);
-  } else {
-    rmwMask = b.and_(rmwMask, b.icmp_eq(isOddI32, b.i32_val(0)));
-  }
+  Value rightNeighbourAddr =
+      genI32TiledOp(rewriter, shiftLeftI32ByDpp, castedAddr);
+
+  // Packing optimization only supported if following conditions are true:
+  // 1. address is aligned by 4 bytes
+  // 2. right neighbour has adjacent address
+  // 3. both threads are active
+  Value isAligned = b.icmp_eq(b.urem(castedAddr, b.i64_val(4)), b.i64_val(0));
+  Value neighbourAddrAdjacent = b.icmp_eq(
+      rightNeighbourAddr,
+      b.add(castedAddr, b.i64_val(valueElemTy.getIntOrFloatBitWidth() / 8)));
+  Value neighbourEnabled = b.icmp_ne(i64Ones, rightNeighbourAddr);
+  Value bothEnabled = b.and_(neighbourEnabled, rmwMask);
+  Value enablePackedOpt =
+      b.and_(b.and_(isAligned, bothEnabled), neighbourAddrAdjacent);
+
+  // Enable only the even threads.
+  Value anyEnabled = b.or_(neighbourEnabled, rmwMask);
+  // If one of the threads is disabled, use the neighbour's addr.
+  rightNeighbourAddr =
+      b.select(neighbourEnabled, rightNeighbourAddr, castedAddr);
+  castedAddr = b.select(rmwMask, castedAddr, rightNeighbourAddr);
+
+  rmwMask = b.and_(anyEnabled, b.icmp_eq(isOddI32, b.i32_val(0)));
+
+  // Unpack results back
+  Value rightNeighbourPtr = b.inttoptr(rmwPtr.getType(), rightNeighbourAddr);
+  rmwPtr = b.inttoptr(rmwPtr.getType(), castedAddr);
 
   Value undefVal = b.undef(packF16Ty);
   // Build blocks to bypass the atomic instruction for ~rmwMask.
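The change drops the `checkPairs` fast path, so `emitPairedAtomicForEvenTID` now always emits the runtime pairing check and callers simply omit the trailing flag. For readers skimming the diff, below is a minimal host-side sketch, in plain C++ rather than emitted IR, of the predicate that gates the packed f16 atomic; the helper name and scalar signature are illustrative only, but the three gating conditions mirror the comment in the hunk above.

```cpp
#include <cstdint>

// Sketch of the `enablePackedOpt` predicate computed per even/odd lane pair.
// `addr` is the even lane's byte address, `rightAddr` the odd neighbour's
// address obtained via the DPP shift (inactive lanes carry ~0, i.e. i64Ones),
// and `evenLaneMask` corresponds to `rmwMask` on the even lane.
bool enablePackedOpt(uint64_t addr, uint64_t rightAddr, bool evenLaneMask,
                     uint64_t elemBytes) {
  bool isAligned = addr % 4 == 0;                         // 1. 4-byte alignment
  bool neighbourAdjacent = rightAddr == addr + elemBytes; // 2. adjacent address
  bool neighbourEnabled = rightAddr != ~uint64_t(0);
  bool bothEnabled = neighbourEnabled && evenLaneMask;    // 3. both lanes active
  return isAligned && bothEnabled && neighbourAdjacent;
}
```

Afterwards `rmwMask` is narrowed to `anyEnabled && !isOdd`, so only even lanes issue the (possibly packed) atomic.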