@@ -207,8 +207,7 @@ Value AtomicRMWEmitter::emitAtomicRMW(RewriterBase &rewriter, Value rmwPtr,
 
 Value AtomicRMWEmitter::emitPairedAtomicForEvenTID(RewriterBase &rewriter,
                                                    Value rmwPtr, Value valElem,
-                                                   Value rmwMask,
-                                                   bool checkPairs) const {
+                                                   Value rmwMask) const {
   auto loc = rmwPtr.getLoc();
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   Value i64Ones = b.i64_val(~uint64_t(0));
@@ -231,44 +230,34 @@ Value AtomicRMWEmitter::emitPairedAtomicForEvenTID(RewriterBase &rewriter,
   Value dppMoveRes = shiftLeftI32ByDpp(rewriter, packedVal);
   Value operand = b.bitcast(b.or_(packedVal, dppMoveRes), packF16Ty);
 
-  // If a runtime check is unnecessary (`checkPairs` is `false`),
-  // `rightNeighbourPtr` is irrelevant.
-  // Set the conditional value `enablePackedOpt` to `true` to enable DCE on the
-  // runtime check branch.
-  Value rightNeighbourPtr = rmwPtr;
-  Value enablePackedOpt = b.true_val();
-  if (checkPairs) {
-    Value rightNeighbourAddr =
-        genI32TiledOp(rewriter, shiftLeftI32ByDpp, castedAddr);
-
-    // Packing optimization only supported if following conditions are true:
-    // 1. address is aligned by 4 bytes
-    // 2. right neighbour has adjacent address
-    // 3. both threads are active
-    Value isAligned = b.icmp_eq(b.urem(castedAddr, b.i64_val(4)), b.i64_val(0));
-    Value neighbourAddrAdjacent = b.icmp_eq(
-        rightNeighbourAddr,
-        b.add(castedAddr, b.i64_val(valueElemTy.getIntOrFloatBitWidth() / 8)));
-    Value neighbourEnabled = b.icmp_ne(i64Ones, rightNeighbourAddr);
-    Value bothEnabled = b.and_(neighbourEnabled, rmwMask);
-    enablePackedOpt =
-        b.and_(b.and_(isAligned, bothEnabled), neighbourAddrAdjacent);
-
-    // Enable only the even threads.
-    Value anyEnabled = b.or_(neighbourEnabled, rmwMask);
-    // If one of the threads is disabled, use the neighbour's addr.
-    rightNeighbourAddr =
-        b.select(neighbourEnabled, rightNeighbourAddr, castedAddr);
-    castedAddr = b.select(rmwMask, castedAddr, rightNeighbourAddr);
-
-    rmwMask = b.and_(anyEnabled, b.icmp_eq(isOddI32, b.i32_val(0)));
-
-    // Unpack results back
-    rightNeighbourPtr = b.inttoptr(rmwPtr.getType(), rightNeighbourAddr);
-    rmwPtr = b.inttoptr(rmwPtr.getType(), castedAddr);
-  } else {
-    rmwMask = b.and_(rmwMask, b.icmp_eq(isOddI32, b.i32_val(0)));
-  }
+  Value rightNeighbourAddr =
+      genI32TiledOp(rewriter, shiftLeftI32ByDpp, castedAddr);
+
+  // Packing optimization only supported if following conditions are true:
+  // 1. address is aligned by 4 bytes
+  // 2. right neighbour has adjacent address
+  // 3. both threads are active
+  Value isAligned = b.icmp_eq(b.urem(castedAddr, b.i64_val(4)), b.i64_val(0));
+  Value neighbourAddrAdjacent = b.icmp_eq(
+      rightNeighbourAddr,
+      b.add(castedAddr, b.i64_val(valueElemTy.getIntOrFloatBitWidth() / 8)));
+  Value neighbourEnabled = b.icmp_ne(i64Ones, rightNeighbourAddr);
+  Value bothEnabled = b.and_(neighbourEnabled, rmwMask);
+  Value enablePackedOpt =
+      b.and_(b.and_(isAligned, bothEnabled), neighbourAddrAdjacent);
+
+  // Enable only the even threads.
+  Value anyEnabled = b.or_(neighbourEnabled, rmwMask);
+  // If one of the threads is disabled, use the neighbour's addr.
+  rightNeighbourAddr =
+      b.select(neighbourEnabled, rightNeighbourAddr, castedAddr);
+  castedAddr = b.select(rmwMask, castedAddr, rightNeighbourAddr);
+
+  rmwMask = b.and_(anyEnabled, b.icmp_eq(isOddI32, b.i32_val(0)));
+
+  // Unpack results back
+  Value rightNeighbourPtr = b.inttoptr(rmwPtr.getType(), rightNeighbourAddr);
+  rmwPtr = b.inttoptr(rmwPtr.getType(), castedAddr);
 
   Value undefVal = b.undef(packF16Ty);
   // Build blocks to bypass the atomic instruction for ~rmwMask.
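For reference, the `enablePackedOpt` predicate the lowering now always emits checks the three commented conditions at runtime. The sketch below is not part of the commit; it restates that predicate on plain host integers, assuming a hypothetical helper name, an f16-sized element, and that an all-ones address marks an inactive neighbour lane (as `i64Ones` does above).

    #include <cstdint>

    // Illustrative sketch only: mirrors the checks built with b.and_ / b.icmp_*.
    bool enablePackedOpt(uint64_t addr, uint64_t rightNeighbourAddr,
                         bool laneActive, unsigned elemBitWidth /* e.g. 16 */) {
      const uint64_t kInactive = ~uint64_t(0);        // i64Ones sentinel
      bool isAligned = (addr % 4) == 0;               // 1. aligned by 4 bytes
      bool neighbourAdjacent =
          rightNeighbourAddr == addr + elemBitWidth / 8; // 2. adjacent address
      bool neighbourEnabled = rightNeighbourAddr != kInactive;
      bool bothEnabled = neighbourEnabled && laneActive; // 3. both threads active
      return isAligned && bothEnabled && neighbourAdjacent;
    }

Only when all three hold can an even thread issue a single packed f16x2 atomic that covers its own element and its right neighbour's; otherwise each lane falls back to its own scalar atomic.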