
Commit cd33c6b

[SPARC] Weaken emitted barriers for atomic ops (#154950)
Weaken the barriers emitted for atomic ops to the minimum needed to enforce the memory model constraints. In particular, avoid emitting expensive #StoreLoad barriers whenever possible. The emitted barriers conform to V9's RMO and V8's PSO memory models and are compatible with GCC's lowering. A quick test with `pgbench` on a T4-1 shows a small (up to about 4%) but consistent speedup.
1 parent: 3af95f0

File tree

8 files changed: +555 −32 lines
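For a rough sense of the effect (an illustrative sketch, not part of the diff; the function name is invented and a V9 subtarget is assumed), a sequentially consistent read-modify-write is now bracketed by two partial barriers instead of two full membars, matching the updated test expectations below:

; Leading barrier:  membar #LoadStore | #StoreStore   (release semantics)
; Trailing barrier: membar #LoadLoad | #LoadStore     (acquire semantics)
define i32 @example_rmw_seq_cst(ptr %p, i32 %v) {
  %old = atomicrmw add ptr %p, i32 %v seq_cst
  ret i32 %old
}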

llvm/lib/Target/Sparc/SparcAsmPrinter.cpp

Lines changed: 9 additions & 0 deletions

@@ -326,6 +326,15 @@ void SparcAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
 void SparcAsmPrinter::emitInstruction(const MachineInstr *MI) {
   Sparc_MC::verifyInstructionPredicates(MI->getOpcode(),
                                         getSubtargetInfo().getFeatureBits());
+  if (MI->isBundle()) {
+    const MachineBasicBlock *MBB = MI->getParent();
+    MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
+    while (I != MBB->instr_end() && I->isInsideBundle()) {
+      emitInstruction(&*I);
+      ++I;
+    }
+    return;
+  }
 
   switch (MI->getOpcode()) {
   default: break;

llvm/lib/Target/Sparc/SparcISelLowering.cpp

Lines changed: 26 additions & 0 deletions

@@ -33,6 +33,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
@@ -3557,3 +3558,28 @@ void SparcTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
   if (!Node->hasAnyUseOfValue(0))
     MI.getOperand(0).setReg(SP::G0);
 }
+
+Instruction *SparcTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
+                                                   Instruction *Inst,
+                                                   AtomicOrdering Ord) const {
+  bool HasStoreSemantics =
+      isa<AtomicCmpXchgInst, AtomicRMWInst, StoreInst>(Inst);
+  if (HasStoreSemantics && isReleaseOrStronger(Ord))
+    return Builder.CreateFence(AtomicOrdering::Release);
+  return nullptr;
+}
+
+Instruction *SparcTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
+                                                    Instruction *Inst,
+                                                    AtomicOrdering Ord) const {
+  // V8 loads already come with implicit acquire barrier so there's no need to
+  // emit it again.
+  bool HasLoadSemantics = isa<AtomicCmpXchgInst, AtomicRMWInst, LoadInst>(Inst);
+  if (Subtarget->isV9() && HasLoadSemantics && isAcquireOrStronger(Ord))
+    return Builder.CreateFence(AtomicOrdering::Acquire);
+
+  // SC plain stores would need a trailing full barrier.
+  if (isa<StoreInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
+    return Builder.CreateFence(Ord);
+  return nullptr;
+}
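A minimal sketch of how AtomicExpand uses these hooks (assuming shouldInsertFencesForAtomic returns true for these instructions and a V9 subtarget; the IR below is illustrative, not taken from the commit):

define void @store_seq_cst(ptr %p, i32 %v) {
  ; leading fence from emitLeadingFence: store with release-or-stronger ordering
  fence release
  store atomic i32 %v, ptr %p monotonic, align 4
  ; trailing fence from emitTrailingFence: SC plain stores need a full barrier
  fence seq_cst
  ret void
}

define i32 @load_acquire(ptr %p) {
  %v = load atomic i32, ptr %p monotonic, align 4
  ; trailing fence from emitTrailingFence: emitted on V9 only, since V8
  ; load-type operations already carry implicit acquire semantics
  fence acquire
  ret i32 %v
}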

llvm/lib/Target/Sparc/SparcISelLowering.h

Lines changed: 5 additions & 0 deletions

@@ -183,6 +183,11 @@ namespace llvm {
     bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                     EVT VT) const override;
 
+    Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+                                  AtomicOrdering Ord) const override;
+    Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+                                   AtomicOrdering Ord) const override;
+
     bool shouldInsertFencesForAtomic(const Instruction *I) const override {
       // FIXME: We insert fences for each atomics and generate
       // sub-optimal code for PSO/TSO. (Approximately nobody uses any

llvm/lib/Target/Sparc/SparcInstrInfo.cpp

Lines changed: 16 additions & 0 deletions

@@ -653,6 +653,22 @@ bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
         .addImm(Offset);
     return true;
   }
+  case SP::V8BAR: {
+    assert(!Subtarget.isV9() &&
+           "V8BAR should not be emitted on V9 processors!");
+
+    // Emit stbar; ldstub [%sp-1], %g0
+    // The sequence acts as a full barrier on V8 systems.
+    MachineBasicBlock &MBB = *MI.getParent();
+    MachineInstr &InstSTBAR =
+        *BuildMI(MBB, MI, MI.getDebugLoc(), get(SP::STBAR));
+    MachineInstr &InstLDSTUB =
+        *BuildMI(MBB, MI, MI.getDebugLoc(), get(SP::LDSTUBri), SP::G0)
+             .addReg(SP::O6)
+             .addImm(-1);
+    MIBundleBuilder(MBB, InstSTBAR, InstLDSTUB);
+    MBB.erase(MI);
+  }
   }
   return false;
 }

llvm/lib/Target/Sparc/SparcInstrInfo.td

Lines changed: 25 additions & 4 deletions

@@ -578,6 +578,9 @@ class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
   let isPseudo = 1;
 }
 
+// Full memory barrier for V8.
+def V8BAR : Pseudo<(outs), (ins), "!V8BAR", []>, Requires<[HasNoV9]>;
+
 // GETPCX for PIC
 let Defs = [O7] in {
   def GETPCX : Pseudo<(outs getPCX:$getpcseq), (ins), "$getpcseq", [] >;
@@ -1974,12 +1977,30 @@ def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
 def : Pat<(store (i32 0), ADDRrr:$dst), (STrr ADDRrr:$dst, (i32 G0))>;
 def : Pat<(store (i32 0), ADDRri:$dst), (STri ADDRri:$dst, (i32 G0))>;
 
-// store bar for all atomic_fence in V8.
-let Predicates = [HasNoV9] in
-  def : Pat<(atomic_fence timm, timm), (STBAR)>;
+// All load-type operations in V8 comes with implicit acquire semantics.
+let Predicates = [HasNoV9] in {
+  // Acquire -> nop
+  def : Pat<(atomic_fence (i32 4), timm), (NOP)>;
+  // Release / AcqRel -> stbar
+  def : Pat<(atomic_fence (i32 5), timm), (STBAR)>;
+  // AcqRel and stronger -> stbar; ldstub [%sp-1], %g0
+  def : Pat<(atomic_fence timm, timm), (V8BAR)>;
+}
 
-let Predicates = [HasV9] in
+// We have to handle both 32 and 64-bit cases.
+let Predicates = [HasV9] in {
+  // Acquire -> membar #LoadLoad | #LoadStore
+  def : Pat<(atomic_fence (i32 4), timm), (MEMBARi 0x5)>;
+  def : Pat<(atomic_fence (i64 4), timm), (MEMBARi 0x5)>;
+  // Release -> membar #LoadStore | #StoreStore
+  def : Pat<(atomic_fence (i32 5), timm), (MEMBARi 0xc)>;
+  def : Pat<(atomic_fence (i64 5), timm), (MEMBARi 0xc)>;
+  // AcqRel -> membar #LoadLoad | #LoadStore | #StoreStore
+  def : Pat<(atomic_fence (i32 6), timm), (MEMBARi 0xd)>;
+  def : Pat<(atomic_fence (i64 6), timm), (MEMBARi 0xd)>;
+  // SeqCst -> membar #StoreLoad | #LoadLoad | #LoadStore | #StoreStore
   def : Pat<(atomic_fence timm, timm), (MEMBARi 0xf)>;
+}
 
 // atomic_load addr -> load addr
 def : Pat<(i32 (atomic_load_azext_8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
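Reading the ordering immediates above with LLVM's AtomicOrdering numbering (4 = acquire, 5 = release, 6 = acq_rel) and SPARC's membar mask bits (#LoadLoad = 0x1, #StoreLoad = 0x2, #LoadStore = 0x4, #StoreStore = 0x8), a standalone fence now lowers roughly as sketched below; this is an illustration, not a test from the commit:

define void @fences() {
  fence acquire   ; V9: membar 0x5 (#LoadLoad | #LoadStore)               V8: nop
  fence release   ; V9: membar 0xc (#LoadStore | #StoreStore)             V8: stbar
  fence acq_rel   ; V9: membar 0xd (#LoadLoad | #LoadStore | #StoreStore) V8: stbar; ldstub [%sp-1], %g0
  fence seq_cst   ; V9: membar 0xf (all four orderings)                   V8: stbar; ldstub [%sp-1], %g0
  ret void
}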

llvm/test/CodeGen/SPARC/atomicrmw-uinc-udec-wrap.ll

Lines changed: 12 additions & 12 deletions

@@ -5,7 +5,7 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; CHECK-LABEL: atomicrmw_uinc_wrap_i8:
 ; CHECK: .cfi_startproc
 ; CHECK-NEXT: ! %bb.0:
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadStore | #StoreStore
 ; CHECK-NEXT: and %o0, -4, %o2
 ; CHECK-NEXT: mov 3, %o3
 ; CHECK-NEXT: andn %o3, %o0, %o0
@@ -36,7 +36,7 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: ! %bb.2: ! %atomicrmw.end
 ; CHECK-NEXT: srl %o4, %o0, %o0
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadLoad | #LoadStore
 ; CHECK-NEXT: retl
 ; CHECK-NEXT: nop
   %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst
@@ -47,7 +47,7 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; CHECK-LABEL: atomicrmw_uinc_wrap_i16:
 ; CHECK: .cfi_startproc
 ; CHECK-NEXT: ! %bb.0:
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadStore | #StoreStore
 ; CHECK-NEXT: and %o0, -4, %o2
 ; CHECK-NEXT: and %o0, 3, %o0
 ; CHECK-NEXT: xor %o0, 2, %o0
@@ -79,7 +79,7 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: ! %bb.2: ! %atomicrmw.end
 ; CHECK-NEXT: srl %o5, %o0, %o0
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadLoad | #LoadStore
 ; CHECK-NEXT: retl
 ; CHECK-NEXT: nop
   %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst
@@ -90,7 +90,7 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; CHECK-LABEL: atomicrmw_uinc_wrap_i32:
 ; CHECK: .cfi_startproc
 ; CHECK-NEXT: ! %bb.0:
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadStore | #StoreStore
 ; CHECK-NEXT: ld [%o0], %o2
 ; CHECK-NEXT: .LBB2_1: ! %atomicrmw.start
 ; CHECK-NEXT: ! =>This Inner Loop Header: Depth=1
@@ -106,7 +106,7 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; CHECK-NEXT: bne %icc, .LBB2_1
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: ! %bb.2: ! %atomicrmw.end
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadLoad | #LoadStore
 ; CHECK-NEXT: retl
 ; CHECK-NEXT: mov %o2, %o0
   %result = atomicrmw uinc_wrap ptr %ptr, i32 %val seq_cst
@@ -160,7 +160,7 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; CHECK-LABEL: atomicrmw_udec_wrap_i8:
 ; CHECK: .cfi_startproc
 ; CHECK-NEXT: ! %bb.0:
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadStore | #StoreStore
 ; CHECK-NEXT: and %o0, -4, %o2
 ; CHECK-NEXT: mov 3, %o3
 ; CHECK-NEXT: andn %o3, %o0, %o0
@@ -193,7 +193,7 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: ! %bb.2: ! %atomicrmw.end
 ; CHECK-NEXT: srl %o5, %o0, %o0
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadLoad | #LoadStore
 ; CHECK-NEXT: retl
 ; CHECK-NEXT: nop
   %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst
@@ -204,7 +204,7 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; CHECK-LABEL: atomicrmw_udec_wrap_i16:
 ; CHECK: .cfi_startproc
 ; CHECK-NEXT: ! %bb.0:
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadStore | #StoreStore
 ; CHECK-NEXT: and %o0, -4, %o2
 ; CHECK-NEXT: and %o0, 3, %o0
 ; CHECK-NEXT: xor %o0, 2, %o0
@@ -238,7 +238,7 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: ! %bb.2: ! %atomicrmw.end
 ; CHECK-NEXT: srl %g2, %o0, %o0
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadLoad | #LoadStore
 ; CHECK-NEXT: retl
 ; CHECK-NEXT: nop
   %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst
@@ -249,7 +249,7 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; CHECK-LABEL: atomicrmw_udec_wrap_i32:
 ; CHECK: .cfi_startproc
 ; CHECK-NEXT: ! %bb.0:
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadStore | #StoreStore
 ; CHECK-NEXT: ld [%o0], %o2
 ; CHECK-NEXT: .LBB6_1: ! %atomicrmw.start
 ; CHECK-NEXT: ! =>This Inner Loop Header: Depth=1
@@ -267,7 +267,7 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; CHECK-NEXT: bne %icc, .LBB6_1
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: ! %bb.2: ! %atomicrmw.end
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadLoad | #LoadStore
 ; CHECK-NEXT: retl
 ; CHECK-NEXT: mov %o2, %o0
   %result = atomicrmw udec_wrap ptr %ptr, i32 %val seq_cst
