@@ -135,6 +135,9 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
     return;
   case AtomicRMWInst::Add:
   case AtomicRMWInst::Sub:
+  case AtomicRMWInst::And:
+  case AtomicRMWInst::Or:
+  case AtomicRMWInst::Xor:
   case AtomicRMWInst::Max:
   case AtomicRMWInst::Min:
   case AtomicRMWInst::UMax:
@@ -185,6 +188,21 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
     Op = AtomicRMWInst::Sub;
     break;
+  case Intrinsic::amdgcn_buffer_atomic_and:
+  case Intrinsic::amdgcn_struct_buffer_atomic_and:
+  case Intrinsic::amdgcn_raw_buffer_atomic_and:
+    Op = AtomicRMWInst::And;
+    break;
+  case Intrinsic::amdgcn_buffer_atomic_or:
+  case Intrinsic::amdgcn_struct_buffer_atomic_or:
+  case Intrinsic::amdgcn_raw_buffer_atomic_or:
+    Op = AtomicRMWInst::Or;
+    break;
+  case Intrinsic::amdgcn_buffer_atomic_xor:
+  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+    Op = AtomicRMWInst::Xor;
+    break;
   case Intrinsic::amdgcn_buffer_atomic_smin:
   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
@@ -248,6 +266,12 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
     return B.CreateBinOp(Instruction::Add, LHS, RHS);
   case AtomicRMWInst::Sub:
     return B.CreateBinOp(Instruction::Sub, LHS, RHS);
+  case AtomicRMWInst::And:
+    return B.CreateBinOp(Instruction::And, LHS, RHS);
+  case AtomicRMWInst::Or:
+    return B.CreateBinOp(Instruction::Or, LHS, RHS);
+  case AtomicRMWInst::Xor:
+    return B.CreateBinOp(Instruction::Xor, LHS, RHS);
 
   case AtomicRMWInst::Max:
     Pred = CmpInst::ICMP_SGT;
@@ -273,8 +297,11 @@ static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
273297 llvm_unreachable (" Unhandled atomic op" );
274298 case AtomicRMWInst::Add:
275299 case AtomicRMWInst::Sub:
300+ case AtomicRMWInst::Or:
301+ case AtomicRMWInst::Xor:
276302 case AtomicRMWInst::UMax:
277303 return APInt::getMinValue (BitWidth);
304+ case AtomicRMWInst::And:
278305 case AtomicRMWInst::UMin:
279306 return APInt::getMaxValue (BitWidth);
280307 case AtomicRMWInst::Max:
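The identity choices above follow from needing a seed value that leaves the other operand unchanged: 0 for Or, Xor, Add, Sub, and UMax; all-ones for And and UMin. A minimal standalone check of those identities (illustrative only, not part of the patch):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t X = 0xDEADBEEF;
  assert((X + 0u) == X);          // Add/Sub: identity is 0
  assert((X | 0u) == X);          // Or: identity is 0
  assert((X ^ 0u) == X);          // Xor: identity is 0
  assert(std::max(X, 0u) == X);   // UMax: identity is the minimum value
  assert((X & ~0u) == X);         // And: identity is all-ones
  assert(std::min(X, ~0u) == X);  // UMin: identity is the maximum value
}
```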
@@ -340,10 +367,10 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
   CallInst *const PartialMbcnt = B.CreateIntrinsic(
       Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
-  CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
-                                            {ExtractHi, PartialMbcnt});
-
-  Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);
+  Value *const Mbcnt =
+      B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
+                                        {ExtractHi, PartialMbcnt}),
+                      Ty, false);
 
   Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
 
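For context: the amdgcn_mbcnt_lo/amdgcn_mbcnt_hi pair counts the set bits of the 64-bit exec mask strictly below the current lane, so each lane learns its rank among the active lanes; the rewrite above only folds the zero-extend into the same expression so later uses share one `Mbcnt` of the atomic's type. A scalar model of what the pair computes, with a hypothetical helper name:

```cpp
#include <bit>
#include <cstdint>

// Scalar model of the mbcnt_lo/mbcnt_hi pair for one lane: the number of
// active lanes (set exec bits) strictly below it. `mbcnt` is a
// hypothetical name, and Lane is assumed to be in [0, 63].
static unsigned mbcnt(uint64_t Exec, unsigned Lane) {
  return std::popcount(Exec & ((uint64_t{1} << Lane) - 1));
}
```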
@@ -417,32 +444,39 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
 
     case AtomicRMWInst::Add:
     case AtomicRMWInst::Sub: {
-      // Get the total number of active lanes we have by using popcount.
-      Instruction *const Ctpop =
-          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
-      Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
-
-      // Calculate the new value we will be contributing to the atomic operation
-      // for the entire wavefront.
-      NewV = B.CreateMul(V, CtpopCast);
+      // The new value we will be contributing to the atomic operation is the
+      // old value times the number of active lanes.
+      Value *const Ctpop = B.CreateIntCast(
+          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+      NewV = B.CreateMul(V, Ctpop);
       break;
     }
 
+    case AtomicRMWInst::And:
+    case AtomicRMWInst::Or:
     case AtomicRMWInst::Max:
     case AtomicRMWInst::Min:
     case AtomicRMWInst::UMax:
     case AtomicRMWInst::UMin:
-      // Max/min with a uniform value is idempotent: doing the atomic operation
-      // multiple times has the same effect as doing it once.
+      // These operations with a uniform value are idempotent: doing the atomic
+      // operation multiple times has the same effect as doing it once.
       NewV = V;
       break;
+
+    case AtomicRMWInst::Xor:
+      // The new value we will be contributing to the atomic operation is the
+      // old value times the parity of the number of active lanes.
+      Value *const Ctpop = B.CreateIntCast(
+          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+      NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1));
+      break;
     }
   }
 
   // We only want a single lane to enter our new control flow, and we do this
   // by checking if there are any active lanes below us. Only one lane will
   // have 0 active lanes below us, so that will be the only one to progress.
-  Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));
+  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
 
   // Store I's original basic block before we split the block.
   BasicBlock *const EntryBB = I.getParent();
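The three combining rules above can be summarized scalar-style: with N active lanes all contributing the same value V, Add/Sub scale V by N, Xor keeps only the parity of N, and the remaining operations are idempotent. A sketch of that rule with hypothetical names (illustrative, not part of the patch):

```cpp
#include <cstdint>

// Scalar model of the single whole-wave value submitted in place of N
// per-lane atomics with the uniform operand V.
static uint32_t waveValue(char Op, uint32_t V, unsigned ActiveLanes) {
  switch (Op) {
  case '+': // Add/Sub: N lanes contribute N * V (modulo 2^32).
    return V * ActiveLanes;
  case '^': // Xor: pairs of V cancel out, so only the parity of N matters.
    return V * (ActiveLanes & 1);
  default:  // And/Or/Max/Min: applying the op N times equals applying it once.
    return V;
  }
}
```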
@@ -511,14 +545,19 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
511545 llvm_unreachable (" Unhandled atomic op" );
512546 case AtomicRMWInst::Add:
513547 case AtomicRMWInst::Sub:
514- LaneOffset = B.CreateMul (V, MbcntCast );
548+ LaneOffset = B.CreateMul (V, Mbcnt );
515549 break ;
550+ case AtomicRMWInst::And:
551+ case AtomicRMWInst::Or:
516552 case AtomicRMWInst::Max:
517553 case AtomicRMWInst::Min:
518554 case AtomicRMWInst::UMax:
519555 case AtomicRMWInst::UMin:
520556 LaneOffset = B.CreateSelect (Cond, Identity, V);
521557 break ;
558+ case AtomicRMWInst::Xor:
559+ LaneOffset = B.CreateMul (V, B.CreateAnd (Mbcnt, 1 ));
560+ break ;
522561 }
523562 }
524563 Value *const Result = buildNonAtomicBinOp (B, Op, BroadcastI, LaneOffset);
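As with the whole-wave value, the per-lane offsets above mimic what a serialized sequence of atomics would have returned to each lane: the broadcast old value combined with the contributions of the `Mbcnt` active lanes below it. A scalar model with hypothetical names (illustrative, not part of the patch):

```cpp
#include <cstdint>

// Scalar model of the per-lane return value: OldVal is the value the one
// real atomic returned (broadcast to all lanes), LanesBelow is this
// lane's mbcnt result.
static uint32_t laneResult(char Op, uint32_t OldVal, uint32_t V,
                           unsigned LanesBelow) {
  switch (Op) {
  case '+': // Add: LanesBelow earlier lanes each added V first.
    return OldVal + V * LanesBelow;
  case '^': // Xor: only the parity of the earlier contributions survives.
    return OldVal ^ (V * (LanesBelow & 1));
  case '|': // Or (and the other idempotent ops): lane 0 sees the old value,
            // every later lane sees it with V applied once.
    return LanesBelow == 0 ? OldVal : (OldVal | V);
  default:
    return OldVal;
  }
}
```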