[SPARC] Weaken emitted barriers for atomic ops #154950
Conversation
Weaken the barriers emitted for atomic ops to the form that is just strong enough to enforce the required ordering constraints, but no stronger. In particular, we try to avoid emitting expensive #StoreLoad barriers whenever possible. The barriers emitted still conform to V9's RMO and V8's PSO memory models, and are compatible with GCC's lowering.
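As a reading aid, here is a rough sketch (not code from this patch) of the decision the description implies, written against LLVM's AtomicOrdering helpers. It covers the V9/RMO case; on V8 the trailing acquire fence is dropped, since V8 loads already carry acquire semantics.

#include "llvm/Support/AtomicOrdering.h"
using namespace llvm;

// Which fences bracket an atomic operation under this lowering (V9/RMO case).
struct FencePlan {
  bool LeadingRelease = false;  // membar #LoadStore | #StoreStore before the op
  bool TrailingAcquire = false; // membar #LoadLoad | #LoadStore after the op
  bool TrailingFull = false;    // full membar (incl. #StoreLoad) after SC stores
};

static FencePlan planFences(bool IsLoad, bool IsStore, bool IsRMW,
                            AtomicOrdering Ord) {
  FencePlan P;
  if ((IsStore || IsRMW) && isReleaseOrStronger(Ord))
    P.LeadingRelease = true;
  if ((IsLoad || IsRMW) && isAcquireOrStronger(Ord))
    P.TrailingAcquire = true;
  if (IsStore && Ord == AtomicOrdering::SequentiallyConsistent)
    P.TrailingFull = true; // the only place a #StoreLoad barrier remains
  return P;
}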
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-sparc

Author: Koakuma (koachan)

Changes: Weaken barriers for atomic ops to the form that's just enough to enforce memory model constraints. The barriers emitted conform to V9's RMO and V8's PSO memory models, and are compatible with GCC's lowering. A quick test with pgbench on a T4-1 shows a small (up to about 4%) but consistent speedup.

Full diff: https://github.com/llvm/llvm-project/pull/154950.diff

5 Files Affected:
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index dd221327dbdc6..a926fe56a61ef 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -33,7 +33,10 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
@@ -3562,3 +3565,27 @@ void SparcTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
if (!Node->hasAnyUseOfValue(0))
MI.getOperand(0).setReg(SP::G0);
}
+
+Instruction *SparcTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ bool HasStoreSemantics = isa<AtomicRMWInst>(Inst) || isa<StoreInst>(Inst);
+ if (HasStoreSemantics && isReleaseOrStronger(Ord))
+ return Builder.CreateFence(AtomicOrdering::Release);
+ return nullptr;
+}
+
+Instruction *SparcTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ // V8 loads already come with implicit acquire barrier so there's no need to
+ // emit it again.
+ bool HasLoadSemantics = isa<AtomicRMWInst>(Inst) || isa<LoadInst>(Inst);
+ if (Subtarget->isV9() && HasLoadSemantics && isAcquireOrStronger(Ord))
+ return Builder.CreateFence(AtomicOrdering::Acquire);
+
+ // SC plain stores would need a trailing full barrier.
+ if (isa<StoreInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
+ return Builder.CreateFence(Ord);
+ return nullptr;
+}
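For context, a rough sketch (not part of this diff, and simplified, so treat the details as approximate) of how AtomicExpandPass drives these two hooks when shouldInsertFencesForAtomic() returns true: the pass brackets the atomic instruction with the target fences and then lowers the operation itself with a relaxed ordering, so the fences carry all of the ordering.

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/AtomicOrdering.h"
using namespace llvm;

static void bracketInstWithFences(const TargetLowering &TLI, Instruction *I,
                                  AtomicOrdering Ord) {
  IRBuilder<> Builder(I);                        // insertion point: before I
  TLI.emitLeadingFence(Builder, I, Ord);         // e.g. a release fence
  Instruction *Trailing = TLI.emitTrailingFence(Builder, I, Ord);
  if (Trailing)
    Trailing->moveAfter(I);                      // e.g. an acquire fence
}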
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h
index 4017beb88ff31..73bd8ff6b24a4 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.h
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -182,6 +182,11 @@ namespace llvm {
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const override;
+ Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+
bool shouldInsertFencesForAtomic(const Instruction *I) const override {
// FIXME: We insert fences for each atomics and generate
// sub-optimal code for PSO/TSO. (Approximately nobody uses any
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td
index 1a32eafb0e83d..f427f6bfba63b 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -1957,12 +1957,31 @@ def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
def : Pat<(store (i32 0), ADDRrr:$dst), (STrr ADDRrr:$dst, (i32 G0))>;
def : Pat<(store (i32 0), ADDRri:$dst), (STri ADDRri:$dst, (i32 G0))>;
-// store bar for all atomic_fence in V8.
-let Predicates = [HasNoV9] in
+// All load-type operations in V8 comes with implicit acquire semantics.
+let Predicates = [HasNoV9] in {
+ // Acquire -> nop
+ def : Pat<(atomic_fence (i32 4), timm), (NOP)>;
+ // Release / AcqRel -> stbar
+ def : Pat<(atomic_fence (i32 5), timm), (STBAR)>;
+ // AcqRel and stronger -> stbar; ldstub [%sp-1], %g0
+ // FIXME how to actually emit the ldstub?
def : Pat<(atomic_fence timm, timm), (STBAR)>;
+}
-let Predicates = [HasV9] in
+// We have to handle both 32 and 64-bit cases.
+let Predicates = [HasV9] in {
+ // Acquire -> membar #LoadLoad | #LoadStore
+ def : Pat<(atomic_fence (i32 4), timm), (MEMBARi 0x5)>;
+ def : Pat<(atomic_fence (i64 4), timm), (MEMBARi 0x5)>;
+ // Release -> membar #LoadStore | #StoreStore
+ def : Pat<(atomic_fence (i32 5), timm), (MEMBARi 0xc)>;
+ def : Pat<(atomic_fence (i64 5), timm), (MEMBARi 0xc)>;
+ // AcqRel -> membar #LoadLoad | #LoadStore | #StoreStore
+ def : Pat<(atomic_fence (i32 6), timm), (MEMBARi 0xd)>;
+ def : Pat<(atomic_fence (i64 6), timm), (MEMBARi 0xd)>;
+ // SeqCst -> membar #StoreLoad | #LoadLoad | #LoadStore | #StoreStore
def : Pat<(atomic_fence timm, timm), (MEMBARi 0xf)>;
+}
// atomic_load addr -> load addr
def : Pat<(i32 (atomic_load_azext_8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
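A side note (not part of the patch): the MEMBARi immediates above are the standard V9 mmask bit values, and the fence-operand immediates are LLVM's AtomicOrdering values (4 = acquire, 5 = release, 6 = acq_rel). A quick sanity check of the masks, assuming those bit values:

// V9 membar mmask bits; #StoreLoad is the expensive one this patch avoids
// wherever the memory model allows it.
enum MembarMask : unsigned {
  LoadLoad = 0x1,
  StoreLoad = 0x2,
  LoadStore = 0x4,
  StoreStore = 0x8,
};

static_assert((LoadLoad | LoadStore) == 0x5, "acquire fence");
static_assert((LoadStore | StoreStore) == 0xc, "release fence");
static_assert((LoadLoad | LoadStore | StoreStore) == 0xd, "acq_rel fence");
static_assert((LoadLoad | StoreLoad | LoadStore | StoreStore) == 0xf,
              "seq_cst fence");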
diff --git a/llvm/test/CodeGen/SPARC/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/SPARC/atomicrmw-uinc-udec-wrap.ll
index 380a4a0a6b870..d1f1c46d9b8b1 100644
--- a/llvm/test/CodeGen/SPARC/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/SPARC/atomicrmw-uinc-udec-wrap.ll
@@ -5,7 +5,7 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; CHECK-LABEL: atomicrmw_uinc_wrap_i8:
; CHECK: .cfi_startproc
; CHECK-NEXT: ! %bb.0:
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadStore | #StoreStore
; CHECK-NEXT: and %o0, -4, %o2
; CHECK-NEXT: mov 3, %o3
; CHECK-NEXT: andn %o3, %o0, %o0
@@ -36,7 +36,7 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: nop
; CHECK-NEXT: ! %bb.2: ! %atomicrmw.end
; CHECK-NEXT: srl %o4, %o0, %o0
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadLoad | #LoadStore
; CHECK-NEXT: retl
; CHECK-NEXT: nop
%result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst
@@ -47,7 +47,7 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; CHECK-LABEL: atomicrmw_uinc_wrap_i16:
; CHECK: .cfi_startproc
; CHECK-NEXT: ! %bb.0:
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadStore | #StoreStore
; CHECK-NEXT: and %o0, -4, %o2
; CHECK-NEXT: and %o0, 3, %o0
; CHECK-NEXT: xor %o0, 2, %o0
@@ -79,7 +79,7 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: nop
; CHECK-NEXT: ! %bb.2: ! %atomicrmw.end
; CHECK-NEXT: srl %o5, %o0, %o0
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadLoad | #LoadStore
; CHECK-NEXT: retl
; CHECK-NEXT: nop
%result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst
@@ -90,7 +90,7 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
; CHECK-LABEL: atomicrmw_uinc_wrap_i32:
; CHECK: .cfi_startproc
; CHECK-NEXT: ! %bb.0:
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadStore | #StoreStore
; CHECK-NEXT: ld [%o0], %o2
; CHECK-NEXT: .LBB2_1: ! %atomicrmw.start
; CHECK-NEXT: ! =>This Inner Loop Header: Depth=1
@@ -106,7 +106,7 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
; CHECK-NEXT: bne %icc, .LBB2_1
; CHECK-NEXT: nop
; CHECK-NEXT: ! %bb.2: ! %atomicrmw.end
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadLoad | #LoadStore
; CHECK-NEXT: retl
; CHECK-NEXT: mov %o2, %o0
%result = atomicrmw uinc_wrap ptr %ptr, i32 %val seq_cst
@@ -160,7 +160,7 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
; CHECK-LABEL: atomicrmw_udec_wrap_i8:
; CHECK: .cfi_startproc
; CHECK-NEXT: ! %bb.0:
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadStore | #StoreStore
; CHECK-NEXT: and %o0, -4, %o2
; CHECK-NEXT: mov 3, %o3
; CHECK-NEXT: andn %o3, %o0, %o0
@@ -193,7 +193,7 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: nop
; CHECK-NEXT: ! %bb.2: ! %atomicrmw.end
; CHECK-NEXT: srl %o5, %o0, %o0
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadLoad | #LoadStore
; CHECK-NEXT: retl
; CHECK-NEXT: nop
%result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst
@@ -204,7 +204,7 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
; CHECK-LABEL: atomicrmw_udec_wrap_i16:
; CHECK: .cfi_startproc
; CHECK-NEXT: ! %bb.0:
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadStore | #StoreStore
; CHECK-NEXT: and %o0, -4, %o2
; CHECK-NEXT: and %o0, 3, %o0
; CHECK-NEXT: xor %o0, 2, %o0
@@ -238,7 +238,7 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: nop
; CHECK-NEXT: ! %bb.2: ! %atomicrmw.end
; CHECK-NEXT: srl %g2, %o0, %o0
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadLoad | #LoadStore
; CHECK-NEXT: retl
; CHECK-NEXT: nop
%result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst
@@ -249,7 +249,7 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
; CHECK-LABEL: atomicrmw_udec_wrap_i32:
; CHECK: .cfi_startproc
; CHECK-NEXT: ! %bb.0:
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadStore | #StoreStore
; CHECK-NEXT: ld [%o0], %o2
; CHECK-NEXT: .LBB6_1: ! %atomicrmw.start
; CHECK-NEXT: ! =>This Inner Loop Header: Depth=1
@@ -267,7 +267,7 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
; CHECK-NEXT: bne %icc, .LBB6_1
; CHECK-NEXT: nop
; CHECK-NEXT: ! %bb.2: ! %atomicrmw.end
-; CHECK-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; CHECK-NEXT: membar #LoadLoad | #LoadStore
; CHECK-NEXT: retl
; CHECK-NEXT: mov %o2, %o0
%result = atomicrmw udec_wrap ptr %ptr, i32 %val seq_cst
diff --git a/llvm/test/CodeGen/SPARC/atomics-ordering.ll b/llvm/test/CodeGen/SPARC/atomics-ordering.ll
new file mode 100644
index 0000000000000..25a370b325302
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/atomics-ordering.ll
@@ -0,0 +1,282 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=sparc -verify-machineinstrs | FileCheck %s --check-prefixes=SPARC32
+; RUN: llc < %s -mtriple=sparc -mcpu=leon4 -verify-machineinstrs | FileCheck %s --check-prefixes=SPARC32-LEON4
+; RUN: llc < %s -mtriple=sparc -mcpu=v9 -verify-machineinstrs | FileCheck %s --check-prefixes=SPARC32-V9
+; RUN: llc < %s -mtriple=sparcv9 -verify-machineinstrs | FileCheck %s --check-prefixes=SPARC64
+
+define i32 @load_acq(ptr %0) nounwind {
+; SPARC32-LABEL: load_acq:
+; SPARC32: ! %bb.0:
+; SPARC32-NEXT: save %sp, -96, %sp
+; SPARC32-NEXT: mov %i0, %o0
+; SPARC32-NEXT: call __atomic_load_4
+; SPARC32-NEXT: mov 2, %o1
+; SPARC32-NEXT: ret
+; SPARC32-NEXT: restore %g0, %o0, %o0
+;
+; SPARC32-LEON4-LABEL: load_acq:
+; SPARC32-LEON4: ! %bb.0:
+; SPARC32-LEON4-NEXT: retl
+; SPARC32-LEON4-NEXT: ld [%o0], %o0
+;
+; SPARC32-V9-LABEL: load_acq:
+; SPARC32-V9: ! %bb.0:
+; SPARC32-V9-NEXT: ld [%o0], %o0
+; SPARC32-V9-NEXT: membar #LoadLoad | #LoadStore
+; SPARC32-V9-NEXT: retl
+; SPARC32-V9-NEXT: nop
+;
+; SPARC64-LABEL: load_acq:
+; SPARC64: ! %bb.0:
+; SPARC64-NEXT: ld [%o0], %o0
+; SPARC64-NEXT: membar #LoadLoad | #LoadStore
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: nop
+ %2 = load atomic i32, ptr %0 acquire, align 4
+ ret i32 %2
+}
+
+define i32 @load_sc(ptr %0) nounwind {
+; SPARC32-LABEL: load_sc:
+; SPARC32: ! %bb.0:
+; SPARC32-NEXT: save %sp, -96, %sp
+; SPARC32-NEXT: mov %i0, %o0
+; SPARC32-NEXT: call __atomic_load_4
+; SPARC32-NEXT: mov 5, %o1
+; SPARC32-NEXT: ret
+; SPARC32-NEXT: restore %g0, %o0, %o0
+;
+; SPARC32-LEON4-LABEL: load_sc:
+; SPARC32-LEON4: ! %bb.0:
+; SPARC32-LEON4-NEXT: retl
+; SPARC32-LEON4-NEXT: ld [%o0], %o0
+;
+; SPARC32-V9-LABEL: load_sc:
+; SPARC32-V9: ! %bb.0:
+; SPARC32-V9-NEXT: ld [%o0], %o0
+; SPARC32-V9-NEXT: membar #LoadLoad | #LoadStore
+; SPARC32-V9-NEXT: retl
+; SPARC32-V9-NEXT: nop
+;
+; SPARC64-LABEL: load_sc:
+; SPARC64: ! %bb.0:
+; SPARC64-NEXT: ld [%o0], %o0
+; SPARC64-NEXT: membar #LoadLoad | #LoadStore
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: nop
+ %2 = load atomic i32, ptr %0 seq_cst, align 4
+ ret i32 %2
+}
+
+define void @store_rel(ptr %0, i32 %1) nounwind {
+; SPARC32-LABEL: store_rel:
+; SPARC32: ! %bb.0:
+; SPARC32-NEXT: save %sp, -96, %sp
+; SPARC32-NEXT: mov %i1, %o1
+; SPARC32-NEXT: mov %i0, %o0
+; SPARC32-NEXT: call __atomic_store_4
+; SPARC32-NEXT: mov 3, %o2
+; SPARC32-NEXT: ret
+; SPARC32-NEXT: restore
+;
+; SPARC32-LEON4-LABEL: store_rel:
+; SPARC32-LEON4: ! %bb.0:
+; SPARC32-LEON4-NEXT: stbar
+; SPARC32-LEON4-NEXT: retl
+; SPARC32-LEON4-NEXT: st %o1, [%o0]
+;
+; SPARC32-V9-LABEL: store_rel:
+; SPARC32-V9: ! %bb.0:
+; SPARC32-V9-NEXT: membar #LoadStore | #StoreStore
+; SPARC32-V9-NEXT: retl
+; SPARC32-V9-NEXT: st %o1, [%o0]
+;
+; SPARC64-LABEL: store_rel:
+; SPARC64: ! %bb.0:
+; SPARC64-NEXT: membar #LoadStore | #StoreStore
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: st %o1, [%o0]
+ store atomic i32 %1, ptr %0 release, align 4
+ ret void
+}
+
+define void @store_sc(ptr %0, i32 %1) nounwind {
+; SPARC32-LABEL: store_sc:
+; SPARC32: ! %bb.0:
+; SPARC32-NEXT: save %sp, -96, %sp
+; SPARC32-NEXT: mov %i1, %o1
+; SPARC32-NEXT: mov %i0, %o0
+; SPARC32-NEXT: call __atomic_store_4
+; SPARC32-NEXT: mov 5, %o2
+; SPARC32-NEXT: ret
+; SPARC32-NEXT: restore
+;
+; SPARC32-LEON4-LABEL: store_sc:
+; SPARC32-LEON4: ! %bb.0:
+; SPARC32-LEON4-NEXT: stbar
+; SPARC32-LEON4-NEXT: st %o1, [%o0]
+; SPARC32-LEON4-NEXT: stbar
+; SPARC32-LEON4-NEXT: retl
+; SPARC32-LEON4-NEXT: nop
+;
+; SPARC32-V9-LABEL: store_sc:
+; SPARC32-V9: ! %bb.0:
+; SPARC32-V9-NEXT: membar #LoadStore | #StoreStore
+; SPARC32-V9-NEXT: st %o1, [%o0]
+; SPARC32-V9-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; SPARC32-V9-NEXT: retl
+; SPARC32-V9-NEXT: nop
+;
+; SPARC64-LABEL: store_sc:
+; SPARC64: ! %bb.0:
+; SPARC64-NEXT: membar #LoadStore | #StoreStore
+; SPARC64-NEXT: st %o1, [%o0]
+; SPARC64-NEXT: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: nop
+ store atomic i32 %1, ptr %0 seq_cst, align 4
+ ret void
+}
+
+define i32 @rmw_acq(ptr %0, i32 %1) nounwind {
+; SPARC32-LABEL: rmw_acq:
+; SPARC32: ! %bb.0:
+; SPARC32-NEXT: save %sp, -96, %sp
+; SPARC32-NEXT: mov %i1, %o1
+; SPARC32-NEXT: mov %i0, %o0
+; SPARC32-NEXT: call __atomic_exchange_4
+; SPARC32-NEXT: mov 2, %o2
+; SPARC32-NEXT: ret
+; SPARC32-NEXT: restore %g0, %o0, %o0
+;
+; SPARC32-LEON4-LABEL: rmw_acq:
+; SPARC32-LEON4: ! %bb.0:
+; SPARC32-LEON4-NEXT: swap [%o0], %o1
+; SPARC32-LEON4-NEXT: retl
+; SPARC32-LEON4-NEXT: mov %o1, %o0
+;
+; SPARC32-V9-LABEL: rmw_acq:
+; SPARC32-V9: ! %bb.0:
+; SPARC32-V9-NEXT: swap [%o0], %o1
+; SPARC32-V9-NEXT: membar #LoadLoad | #LoadStore
+; SPARC32-V9-NEXT: retl
+; SPARC32-V9-NEXT: mov %o1, %o0
+;
+; SPARC64-LABEL: rmw_acq:
+; SPARC64: ! %bb.0:
+; SPARC64-NEXT: swap [%o0], %o1
+; SPARC64-NEXT: membar #LoadLoad | #LoadStore
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: mov %o1, %o0
+ %3 = atomicrmw xchg ptr %0, i32 %1 acquire, align 4
+ ret i32 %3
+}
+
+define i32 @rmw_rel(ptr %0, i32 %1) nounwind {
+; SPARC32-LABEL: rmw_rel:
+; SPARC32: ! %bb.0:
+; SPARC32-NEXT: save %sp, -96, %sp
+; SPARC32-NEXT: mov %i1, %o1
+; SPARC32-NEXT: mov %i0, %o0
+; SPARC32-NEXT: call __atomic_exchange_4
+; SPARC32-NEXT: mov 3, %o2
+; SPARC32-NEXT: ret
+; SPARC32-NEXT: restore %g0, %o0, %o0
+;
+; SPARC32-LEON4-LABEL: rmw_rel:
+; SPARC32-LEON4: ! %bb.0:
+; SPARC32-LEON4-NEXT: stbar
+; SPARC32-LEON4-NEXT: swap [%o0], %o1
+; SPARC32-LEON4-NEXT: retl
+; SPARC32-LEON4-NEXT: mov %o1, %o0
+;
+; SPARC32-V9-LABEL: rmw_rel:
+; SPARC32-V9: ! %bb.0:
+; SPARC32-V9-NEXT: membar #LoadStore | #StoreStore
+; SPARC32-V9-NEXT: swap [%o0], %o1
+; SPARC32-V9-NEXT: retl
+; SPARC32-V9-NEXT: mov %o1, %o0
+;
+; SPARC64-LABEL: rmw_rel:
+; SPARC64: ! %bb.0:
+; SPARC64-NEXT: membar #LoadStore | #StoreStore
+; SPARC64-NEXT: swap [%o0], %o1
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: mov %o1, %o0
+ %3 = atomicrmw xchg ptr %0, i32 %1 release, align 4
+ ret i32 %3
+}
+
+define i32 @rmw_acq_rel(ptr %0, i32 %1) nounwind {
+; SPARC32-LABEL: rmw_acq_rel:
+; SPARC32: ! %bb.0:
+; SPARC32-NEXT: save %sp, -96, %sp
+; SPARC32-NEXT: mov %i1, %o1
+; SPARC32-NEXT: mov %i0, %o0
+; SPARC32-NEXT: call __atomic_exchange_4
+; SPARC32-NEXT: mov 4, %o2
+; SPARC32-NEXT: ret
+; SPARC32-NEXT: restore %g0, %o0, %o0
+;
+; SPARC32-LEON4-LABEL: rmw_acq_rel:
+; SPARC32-LEON4: ! %bb.0:
+; SPARC32-LEON4-NEXT: stbar
+; SPARC32-LEON4-NEXT: swap [%o0], %o1
+; SPARC32-LEON4-NEXT: retl
+; SPARC32-LEON4-NEXT: mov %o1, %o0
+;
+; SPARC32-V9-LABEL: rmw_acq_rel:
+; SPARC32-V9: ! %bb.0:
+; SPARC32-V9-NEXT: membar #LoadStore | #StoreStore
+; SPARC32-V9-NEXT: swap [%o0], %o1
+; SPARC32-V9-NEXT: membar #LoadLoad | #LoadStore
+; SPARC32-V9-NEXT: retl
+; SPARC32-V9-NEXT: mov %o1, %o0
+;
+; SPARC64-LABEL: rmw_acq_rel:
+; SPARC64: ! %bb.0:
+; SPARC64-NEXT: membar #LoadStore | #StoreStore
+; SPARC64-NEXT: swap [%o0], %o1
+; SPARC64-NEXT: membar #LoadLoad | #LoadStore
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: mov %o1, %o0
+ %3 = atomicrmw xchg ptr %0, i32 %1 acq_rel, align 4
+ ret i32 %3
+}
+
+define i32 @rmw_sc(ptr %0, i32 %1) nounwind {
+; SPARC32-LABEL: rmw_sc:
+; SPARC32: ! %bb.0:
+; SPARC32-NEXT: save %sp, -96, %sp
+; SPARC32-NEXT: mov %i1, %o1
+; SPARC32-NEXT: mov %i0, %o0
+; SPARC32-NEXT: call __atomic_exchange_4
+; SPARC32-NEXT: mov 5, %o2
+; SPARC32-NEXT: ret
+; SPARC32-NEXT: restore %g0, %o0, %o0
+;
+; SPARC32-LEON4-LABEL: rmw_sc:
+; SPARC32-LEON4: ! %bb.0:
+; SPARC32-LEON4-NEXT: stbar
+; SPARC32-LEON4-NEXT: swap [%o0], %o1
+; SPARC32-LEON4-NEXT: retl
+; SPARC32-LEON4-NEXT: mov %o1, %o0
+;
+; SPARC32-V9-LABEL: rmw_sc:
+; SPARC32-V9: ! %bb.0:
+; SPARC32-V9-NEXT: membar #LoadStore | #StoreStore
+; SPARC32-V9-NEXT: swap [%o0], %o1
+; SPARC32-V9-NEXT: membar #LoadLoad | #LoadStore
+; SPARC32-V9-NEXT: retl
+; SPARC32-V9-NEXT: mov %o1, %o0
+;
+; SPARC64-LABEL: rmw_sc:
+; SPARC64: ! %bb.0:
+; SPARC64-NEXT: membar #LoadStore | #StoreStore
+; SPARC64-NEXT: swap [%o0], %o1
+; SPARC64-NEXT: membar #LoadLoad | #LoadStore
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: mov %o1, %o0
+ %3 = atomicrmw xchg ptr %0, i32 %1 seq_cst, align 4
+ ret i32 %3
+}
// AcqRel and stronger -> stbar; ldstub [%sp-1], %g0
// FIXME how to actually emit the ldstub?
def : Pat<(atomic_fence timm, timm), (STBAR)>;
Currently LLVM emits buggy SC barriers on V8; it should be a two-instruction sequence, but it only emits the stbar without the following ldstub, and I haven't figured out how to convince it to emit the ldstub too.
Any advice on what I should do here?
The AtomicExpand pass has a number of callbacks; we probably need to implement more of them.
I implemented the barrier as a pseudo; it appears to emit the correct sequence.
Now that the V8 barrier issue is resolved, I think this is ready for merging.
// Emit stbar; ldstub [%sp-1], %g0
// The sequence acts as a full barrier on V8 systems.
STBARInst.setOpcode(SP::STBAR);
LDSTUBInst.setOpcode(SP::LDSTUBri);
LDSTUBInst.addOperand(MCOperand::createReg(SP::G0));
LDSTUBInst.addOperand(MCOperand::createReg(SP::O6));
LDSTUBInst.addOperand(MCOperand::createImm(-1));

OutStreamer->emitInstruction(STBARInst, STI);
OutStreamer->emitInstruction(LDSTUBInst, STI);
Expanding the pseudo in MC is highly unusual and probably a bad idea. At the latest, this should be a post-RA pseudo expanded into a bundle.
(that's assuming these actually need to be kept immediately sequential)
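A minimal sketch of that direction, for illustration only; the pseudo name SP::V8BAR and the ldstub operands are taken from later in this thread, and the actual hook and bundle handling in the landed patch may differ.

#include "SparcInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Expand a V8 full-barrier pseudo after register allocation into
//   stbar; ldstub [%sp-1], %g0
bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  switch (MI.getOpcode()) {
  default:
    return false;
  case SP::V8BAR: {
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(MBB, MI, DL, get(SP::STBAR));
    BuildMI(MBB, MI, DL, get(SP::LDSTUBri), SP::G0)
        .addReg(SP::O6)
        .addImm(-1); // ldstub [%sp-1], %g0
    // Whether and how to keep the pair bundled is the question discussed below.
    MI.eraseFromParent();
    return true;
  }
  }
}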
Okay, moved the expansion. Is it correct now?
cc @arsenm

Ping?

Ping?

cc @arsenm
*BuildMI(MBB, MI, MI.getDebugLoc(), get(SP::LDSTUBri), SP::G0)
    .addReg(SP::O6)
    .addImm(-1);
MIBundleBuilder(MBB, InstSTBAR, InstLDSTUB);
Suggested change:
- MIBundleBuilder(MBB, InstSTBAR, InstLDSTUB);
+ finalizeBundle(MBB, InstSTBAR, InstLDSTUB);
I'm not sure what MIBundleBuilder is for, but I think you need to use finalizeBundle to get this properly prefixed with a BUNDLE instruction
Re: "to get this properly prefixed with a BUNDLE instruction": the BUNDLE instruction isn't supposed to show up in the asm output, right? Because after I changed it to follow your suggestion, the test fails like this:
atomics-ordering.ll:118:23: error: SPARC32-LEON4-NEXT: is not on the line after the previous match
; SPARC32-LEON4-NEXT: stbar
^
<stdin>:42:2: note: 'next' match was here
stbar
^
<stdin>:40:15: note: previous match ended here
st %o1, [%o0]
^
<stdin>:41:1: note: non-matching line after previous match is here
BUNDLE
^
Input file: <stdin>
Check file: /home/k/llvm-debug/src/llvm-project/llvm/test/CodeGen/SPARC/atomics-ordering.ll
-dump-input=help explains the following input dump.
Input was:
<<<<<<
.
.
.
37: store_sc: ! @store_sc
38: ! %bb.0:
39: stbar
40: st %o1, [%o0]
41: BUNDLE
42: stbar
next:118 !~~~~ error: match on wrong line
43: ldstub [%sp+-1], %g0
44: retl
45: nop
46: .Lfunc_end3:
47: .size store_sc, .Lfunc_end3-store_sc
.
.
.
>>>>>>
The MC lowering should be skipping it if it's not really a bundle.
Hmm, I'm not sure what you mean by MC lowering here, sorry.
In my understanding, after doing finalizeBundle the stbar; ldstub sequence will be prefixed by a BUNDLE instruction, but then a later pass ought to remove the BUNDLE prefix, right?
Ping @arsenm?
I'm still confused about what I should do here to build the bundle properly...
                                       AtomicOrdering Ord) const {
  // V8 loads already come with implicit acquire barrier so there's no need to
  // emit it again.
  bool HasLoadSemantics = isa<AtomicRMWInst>(Inst) || isa<LoadInst>(Inst);
What about cmpxchg? Comment if it is intended to not be handled
Wait, isn't cmpxchg already included in AtomicRMWInst? Do I need to add another check for opcodes?
Ah, I see there really is an AtomicCmpXchgInst. Adding it.
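For reference, the amended checks would presumably look something like this (hypothetical phrasing; the updated patch may factor it differently):

#include "llvm/IR/Instructions.h"
using namespace llvm;

// cmpxchg has both load and store semantics, like atomicrmw.
static bool hasStoreSemantics(const Instruction *I) {
  return isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I) || isa<StoreInst>(I);
}

static bool hasLoadSemantics(const Instruction *I) {
  return isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I) || isa<LoadInst>(I);
}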
Instruction *SparcTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                   Instruction *Inst,
                                                   AtomicOrdering Ord) const {
  bool HasStoreSemantics = isa<AtomicRMWInst>(Inst) || isa<StoreInst>(Inst);
What about cmpxchg? Comment if it is intended to not be handled
; SPARC64-NEXT:    mov %o1, %o0
  %3 = atomicrmw xchg ptr %0, i32 %1 seq_cst, align 4
  ret i32 %3
}
cmpxchg not covered in the tests
One of the buildbots is failing with CodeGen/SPARC/atomics-ordering.ll after this change (https://lab.llvm.org/buildbot/#/builders/24/builds/13433). Could you please take a look? The failing case is
Follow-up commit (…dled): Buildbot report: https://lab.llvm.org/buildbot/#/builders/24/builds/13433. Use-after-poison happens because after SP::V8BAR is handled, it erases MI, which should thereafter not be inspected by ExpandPostRA::run().
Follow-up fix (…#162424): Use-after-poison happens because after SP::V8BAR is handled, it erases MI, which therefore should not be inspected by `ExpandPostRA::run()`. This fixes a buildbot-reported issue from #154950 (https://lab.llvm.org/buildbot/#/builders/24/builds/13433).
Weaken barriers for atomic ops to the form that's just enough to enforce memory model constraints. In particular, we try to avoid emitting expensive #StoreLoad barriers whenever possible. The barriers emitted conform to V9's RMO and V8's PSO memory models, and are compatible with GCC's lowering. A quick test with `pgbench` on a T4-1 shows a small (up to about 4%) but consistent speedup.