
Commit 5d8fc59

Jay Foad authored and committed
[AMDGPU] Optimize atomic AND/OR/XOR
Summary: Extend the atomic optimizer to handle AND, OR and XOR.

Reviewers: arsenm, sheredom

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, jfb, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D64809

Change-Id: I13d4ba5850c69f57778854fe5439767e6efad02b
1 parent 9bc5dd4 commit 5d8fc59

2 files changed, +91 -16 lines changed

lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 55 additions & 16 deletions
@@ -135,6 +135,9 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
     return;
   case AtomicRMWInst::Add:
   case AtomicRMWInst::Sub:
+  case AtomicRMWInst::And:
+  case AtomicRMWInst::Or:
+  case AtomicRMWInst::Xor:
   case AtomicRMWInst::Max:
   case AtomicRMWInst::Min:
   case AtomicRMWInst::UMax:
@@ -185,6 +188,21 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
     Op = AtomicRMWInst::Sub;
     break;
+  case Intrinsic::amdgcn_buffer_atomic_and:
+  case Intrinsic::amdgcn_struct_buffer_atomic_and:
+  case Intrinsic::amdgcn_raw_buffer_atomic_and:
+    Op = AtomicRMWInst::And;
+    break;
+  case Intrinsic::amdgcn_buffer_atomic_or:
+  case Intrinsic::amdgcn_struct_buffer_atomic_or:
+  case Intrinsic::amdgcn_raw_buffer_atomic_or:
+    Op = AtomicRMWInst::Or;
+    break;
+  case Intrinsic::amdgcn_buffer_atomic_xor:
+  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+    Op = AtomicRMWInst::Xor;
+    break;
   case Intrinsic::amdgcn_buffer_atomic_smin:
   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
@@ -248,6 +266,12 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
     return B.CreateBinOp(Instruction::Add, LHS, RHS);
   case AtomicRMWInst::Sub:
     return B.CreateBinOp(Instruction::Sub, LHS, RHS);
+  case AtomicRMWInst::And:
+    return B.CreateBinOp(Instruction::And, LHS, RHS);
+  case AtomicRMWInst::Or:
+    return B.CreateBinOp(Instruction::Or, LHS, RHS);
+  case AtomicRMWInst::Xor:
+    return B.CreateBinOp(Instruction::Xor, LHS, RHS);
 
   case AtomicRMWInst::Max:
     Pred = CmpInst::ICMP_SGT;
@@ -273,8 +297,11 @@ static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
     llvm_unreachable("Unhandled atomic op");
   case AtomicRMWInst::Add:
   case AtomicRMWInst::Sub:
+  case AtomicRMWInst::Or:
+  case AtomicRMWInst::Xor:
   case AtomicRMWInst::UMax:
     return APInt::getMinValue(BitWidth);
+  case AtomicRMWInst::And:
   case AtomicRMWInst::UMin:
     return APInt::getMaxValue(BitWidth);
   case AtomicRMWInst::Max:
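The identity mapping in this hunk is easy to sanity-check: an operation's identity element leaves the other operand unchanged, so OR and XOR share ADD/SUB's zero (APInt::getMinValue) while AND needs all-ones (APInt::getMaxValue). A minimal compile-time spot-check of that claim, as standalone C++ rather than anything in the patch:

#include <cstdint>

// op(x, identity) == x must hold for every x; spot-check one 32-bit value.
constexpr uint32_t X = 0xDEADBEEFu;
static_assert(X + 0u == X, "0 is the identity for ADD/SUB");
static_assert((X | 0u) == X, "0 is the identity for OR");
static_assert((X ^ 0u) == X, "0 is the identity for XOR");
static_assert((X & ~0u) == X, "all-ones is the identity for AND");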
@@ -340,10 +367,10 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
   CallInst *const PartialMbcnt = B.CreateIntrinsic(
       Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
-  CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
-                                            {ExtractHi, PartialMbcnt});
-
-  Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);
+  Value *const Mbcnt =
+      B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
+                                        {ExtractHi, PartialMbcnt}),
+                      Ty, false);
 
   Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
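The mbcnt_lo/mbcnt_hi pair computes, for each lane, how many active lanes have a lower lane id; the refactor above only folds the zero-extension into the same expression. A host-side model of what the chained intrinsics return (a sketch with illustrative names, using C++20 std::popcount and a plain uint64_t stand-in for the EXEC ballot, not code from the pass):

#include <bit>
#include <cstdint>

// Number of set bits in `exec` strictly below `lane` (0-63), i.e. how many
// active lanes precede this one in the wave.
uint32_t mbcnt(uint64_t exec, uint32_t lane) {
  const uint64_t below = (lane == 0) ? 0 : (~uint64_t{0} >> (64 - lane));
  return std::popcount(exec & below);
}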

@@ -417,32 +444,39 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
 
     case AtomicRMWInst::Add:
     case AtomicRMWInst::Sub: {
-      // Get the total number of active lanes we have by using popcount.
-      Instruction *const Ctpop =
-          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
-      Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
-
-      // Calculate the new value we will be contributing to the atomic operation
-      // for the entire wavefront.
-      NewV = B.CreateMul(V, CtpopCast);
+      // The new value we will be contributing to the atomic operation is the
+      // old value times the number of active lanes.
+      Value *const Ctpop = B.CreateIntCast(
+          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+      NewV = B.CreateMul(V, Ctpop);
       break;
     }
 
+    case AtomicRMWInst::And:
+    case AtomicRMWInst::Or:
     case AtomicRMWInst::Max:
     case AtomicRMWInst::Min:
     case AtomicRMWInst::UMax:
     case AtomicRMWInst::UMin:
-      // Max/min with a uniform value is idempotent: doing the atomic operation
-      // multiple times has the same effect as doing it once.
+      // These operations with a uniform value are idempotent: doing the atomic
+      // operation multiple times has the same effect as doing it once.
       NewV = V;
       break;
+
+    case AtomicRMWInst::Xor:
+      // The new value we will be contributing to the atomic operation is the
+      // old value times the parity of the number of active lanes.
+      Value *const Ctpop = B.CreateIntCast(
+          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+      NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1));
+      break;
     }
   }
 
   // We only want a single lane to enter our new control flow, and we do this
   // by checking if there are any active lanes below us. Only one lane will
   // have 0 active lanes below us, so that will be the only one to progress.
-  Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));
+  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
 
   // Store I's original basic block before we split the block.
   BasicBlock *const EntryBB = I.getParent();
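The switch above decides what the single wave-wide atomic should apply when the operand V is uniform across lanes: n additions of V sum to V*n, XORs cancel in pairs so only the parity of n survives, and AND/OR (like min/max) are idempotent. A scalar sketch of that reduction in plain C++ (host-side model, not the pass; the ballot's popcount becomes n here):

#include <bit>
#include <cstdint>

// Value the one real atomic applies on behalf of n active lanes that all
// carry the same operand v.
uint32_t waveContribution(char op, uint32_t v, uint64_t execBallot) {
  const uint32_t n = std::popcount(execBallot); // ctpop of the ballot
  switch (op) {
  case '+': return v * n;       // n additions of v
  case '^': return v * (n & 1); // pairs of XORs cancel; parity remains
  case '&':
  case '|': return v;           // idempotent: once == n times
  default:  return v;
  }
}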
@@ -511,14 +545,19 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
       llvm_unreachable("Unhandled atomic op");
     case AtomicRMWInst::Add:
     case AtomicRMWInst::Sub:
-      LaneOffset = B.CreateMul(V, MbcntCast);
+      LaneOffset = B.CreateMul(V, Mbcnt);
       break;
+    case AtomicRMWInst::And:
+    case AtomicRMWInst::Or:
     case AtomicRMWInst::Max:
     case AtomicRMWInst::Min:
     case AtomicRMWInst::UMax:
     case AtomicRMWInst::UMin:
       LaneOffset = B.CreateSelect(Cond, Identity, V);
       break;
+    case AtomicRMWInst::Xor:
+      LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
+      break;
     }
   }
   Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
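After the single atomic, each lane rebuilds the old value it would have seen had every lane done its own atomic, by combining the broadcast result with the contributions of the lanes below it (Mbcnt). A host-side sketch of that fix-up under the same assumptions as the models above (illustrative only, not the pass):

#include <cstdint>

// Old value a lane would have observed: the broadcast pre-atomic value
// combined with what the `mbcnt` active lanes below it contributed.
uint32_t laneOldValue(char op, uint32_t broadcast, uint32_t v, uint32_t mbcnt) {
  switch (op) {
  case '+': return broadcast + v * mbcnt;         // each lower lane added v
  case '^': return broadcast ^ (v * (mbcnt & 1)); // lower XORs cancel in pairs
  case '&': return mbcnt == 0 ? broadcast : broadcast & v; // lane 0 gets the identity
  case '|': return mbcnt == 0 ? broadcast : broadcast | v;
  default:  return broadcast;
  }
}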

test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

Lines changed: 36 additions & 0 deletions
@@ -193,6 +193,42 @@ entry:
   ret void
 }
 
+; GCN-LABEL: and_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: or_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: xor_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: max_i32_varying:
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
