From 89ca62a2e514bfc5764a795e93e96edb82640f64 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Thu, 29 Aug 2024 15:17:40 +0200
Subject: [PATCH 1/3] [X86] Prefer `lock or` over mfence

Originally opened as https://reviews.llvm.org/D129947

LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On
modern CPUs, `lock or` is more efficient and provides the same
sequential consistency. GCC 11 made this switch as well (see
https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and
https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632
moved in this direction as well, but didn't touch `fence seq_cst`.

Amusingly this came up elsewhere:
https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/

After another two years it doesn't look like anyone has complained about
the GCC switch. And there is still `__builtin_ia32_mfence` for folks who
want this precise instruction.
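To make the change concrete, a minimal sketch (the function name is
hypothetical; the expected output is taken from the FileCheck patterns
updated below):

    define void @thread_fence() {
      fence seq_cst   ; what Clang emits for __atomic_thread_fence(seq_cst)
      ret void
    }

Targets without the new tuning flag keep lowering this to

    mfence

while targets with the flag instead emit an idempotent locked RMW on a
dead stack slot,

    lock orl $0, -N(%rsp)

for some small offset N (the tests below match it as
`lock orl $0, -{{[0-9]+}}(%rsp)`). Both drain the store buffer and give
the same ordering for ordinary memory operations; the usual rationale
(cf. D58632/D61863 above) is that `mfence` additionally orders
non-temporal stores, which LLVM's `fence seq_cst` does not require.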
---
 llvm/lib/Target/X86/X86.td                |  52 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp   |   4 +-
 llvm/test/CodeGen/X86/atomic-unordered.ll | 765 +++++++++++++++++++++-
 llvm/test/CodeGen/X86/mfence.ll           |  27 +
 4 files changed, 810 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 38761e1fd7eec..06c8ed8365bd5 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -772,6 +772,10 @@ def TuningUseGLMDivSqrtCosts
 def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
                                        "Target has branch hint feature">;
 
+def TuningAvoidMFENCE
+    : SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
+                       "Avoid MFENCE for fence seq_cst, and instead use lock or">;
+
 //===----------------------------------------------------------------------===//
 // X86 CPU Families
 // TODO: Remove these - use general tuning features to determine codegen.
@@ -833,7 +837,8 @@ def ProcessorFeatures {
       TuningSlow3OpsLEA,
       TuningSlowDivide64,
       TuningSlowIncDec,
-      TuningInsertVZEROUPPER
+      TuningInsertVZEROUPPER,
+      TuningAvoidMFENCE
   ];
   list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [
@@ -849,7 +854,8 @@ def ProcessorFeatures {
       TuningFastSHLDRotate,
       TuningFast15ByteNOP,
       TuningPOPCNTFalseDeps,
-      TuningInsertVZEROUPPER
+      TuningInsertVZEROUPPER,
+      TuningAvoidMFENCE
   ];
   list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
@@ -868,7 +874,8 @@ def ProcessorFeatures {
       TuningPOPCNTFalseDeps,
       TuningLZCNTFalseDeps,
       TuningInsertVZEROUPPER,
-      TuningAllowLight256Bit
+      TuningAllowLight256Bit,
+      TuningAvoidMFENCE
   ];
   list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
@@ -892,7 +899,8 @@ def ProcessorFeatures {
       TuningFastGather,
       TuningPOPCNTFalseDeps,
       TuningInsertVZEROUPPER,
-      TuningAllowLight256Bit
+      TuningAllowLight256Bit,
+      TuningAvoidMFENCE
   ];
 
   // Nehalem
@@ -900,7 +908,8 @@ def ProcessorFeatures {
   list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
                                       TuningSlowDivide64,
                                       TuningInsertVZEROUPPER,
-                                      TuningNoDomainDelayMov];
+                                      TuningNoDomainDelayMov,
+                                      TuningAvoidMFENCE];
 
   // Westmere
   list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -921,7 +930,8 @@ def ProcessorFeatures {
       TuningFast15ByteNOP,
       TuningPOPCNTFalseDeps,
       TuningInsertVZEROUPPER,
-      TuningNoDomainDelayMov];
+      TuningNoDomainDelayMov,
+      TuningAvoidMFENCE];
   list<SubtargetFeature> SNBFeatures =
     !listconcat(WSMFeatures, SNBAdditionalFeatures);
@@ -987,7 +997,8 @@ def ProcessorFeatures {
       TuningAllowLight256Bit,
       TuningNoDomainDelayMov,
       TuningNoDomainDelayShuffle,
-      TuningNoDomainDelayBlend];
+      TuningNoDomainDelayBlend,
+      TuningAvoidMFENCE];
   list<SubtargetFeature> SKLFeatures =
     !listconcat(BDWFeatures, SKLAdditionalFeatures);
@@ -1022,7 +1033,8 @@ def ProcessorFeatures {
       TuningNoDomainDelayMov,
       TuningNoDomainDelayShuffle,
       TuningNoDomainDelayBlend,
-      TuningFastImmVectorShift];
+      TuningFastImmVectorShift,
+      TuningAvoidMFENCE];
   list<SubtargetFeature> SKXFeatures =
     !listconcat(BDWFeatures, SKXAdditionalFeatures);
@@ -1065,7 +1077,8 @@ def ProcessorFeatures {
       TuningNoDomainDelayMov,
       TuningNoDomainDelayShuffle,
       TuningNoDomainDelayBlend,
-      TuningFastImmVectorShift];
+      TuningFastImmVectorShift,
+      TuningAvoidMFENCE];
   list<SubtargetFeature> CNLFeatures =
     !listconcat(SKLFeatures, CNLAdditionalFeatures);
@@ -1094,7 +1107,8 @@ def ProcessorFeatures {
       TuningNoDomainDelayMov,
       TuningNoDomainDelayShuffle,
       TuningNoDomainDelayBlend,
-      TuningFastImmVectorShift];
+      TuningFastImmVectorShift,
+      TuningAvoidMFENCE];
   list<SubtargetFeature> ICLFeatures =
     !listconcat(CNLFeatures, ICLAdditionalFeatures);
@@ -1268,7 +1282,8 @@ def ProcessorFeatures {
   // Tremont
   list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
                                                   FeatureGFNI];
-  list<SubtargetFeature> TRMTuning = GLPTuning;
+  list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
+  list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
   list<SubtargetFeature> TRMFeatures =
     !listconcat(GLPFeatures, TRMAdditionalFeatures);
@@ -1446,7 +1461,8 @@ def ProcessorFeatures {
       TuningFastImm16,
       TuningSBBDepBreaking,
       TuningSlowDivide64,
-      TuningSlowSHLD];
+      TuningSlowSHLD,
+      TuningAvoidMFENCE];
   list<SubtargetFeature> BtVer2Features =
     !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1475,7 +1491,8 @@ def ProcessorFeatures {
       TuningFastScalarShiftMasks,
       TuningBranchFusion,
       TuningSBBDepBreaking,
-      TuningInsertVZEROUPPER];
+      TuningInsertVZEROUPPER,
+      TuningAvoidMFENCE];
 
   // PileDriver
   list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1555,7 +1572,8 @@ def ProcessorFeatures {
       TuningSlowSHLD,
       TuningSBBDepBreaking,
       TuningInsertVZEROUPPER,
-      TuningAllowLight256Bit];
+      TuningAllowLight256Bit,
+      TuningAvoidMFENCE];
   list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID, FeatureRDPRU, @@ -1740,7 +1758,8 @@ def : ProcModel; } foreach P = ["penryn", "core_2_duo_sse4_1"] in { @@ -1759,7 +1778,8 @@ def : ProcModel; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1bda187810a63..4347b1dec4cf0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31906,7 +31906,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // especially clever. // Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct - // lowering for SSID == SyncScope::SingleThread and !hasMFence + // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID); // Finally we can emit the atomic load. @@ -31995,7 +31995,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, // cross-thread fence. if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && FenceSSID == SyncScope::System) { - if (Subtarget.hasMFence()) + if (!Subtarget.avoidMFence() && Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll index 3fb994cdb751a..ff101b9037f0e 100644 --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake | FileCheck --check-prefixes=CHECK,CHECK-O0 %s ; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake | FileCheck --check-prefixes=CHECK,CHECK-O3 %s +; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -mattr=-avoid-mfence | FileCheck --check-prefixes=CHECK,CHECK-MFENCE %s define i8 @load_i8(ptr %ptr) { ; CHECK-O0-LABEL: load_i8: @@ -12,6 +13,11 @@ define i8 @load_i8(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzbl (%rdi), %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i8: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i8, ptr %ptr unordered, align 1 ret i8 %v } @@ -27,6 +33,11 @@ define void @store_i8(ptr %ptr, i8 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movb %sil, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: store_i8: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movb %sil, (%rdi) +; CHECK-MFENCE-NEXT: retq store atomic i8 %v, ptr %ptr unordered, align 1 ret void } @@ -41,6 +52,11 @@ define i16 @load_i16(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzwl (%rdi), %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i16: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i16, ptr %ptr unordered, align 2 ret i16 %v } @@ -57,6 +73,11 @@ define void @store_i16(ptr %ptr, i16 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movw %si, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: store_i16: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movw %si, (%rdi) +; CHECK-MFENCE-NEXT: retq store atomic i16 %v, ptr %ptr unordered, align 2 ret void } @@ -116,6 +137,11 @@ define void @narrow_writeback_or(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: orq $7, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: narrow_writeback_or: 
+; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: orq $7, (%rdi) +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %ptr unordered, align 8 %v.new = or i64 %v, 7 store atomic i64 %v.new, ptr %ptr unordered, align 8 @@ -138,6 +164,12 @@ define void @narrow_writeback_and(ptr %ptr) { ; CHECK-O3-NEXT: movl $4294967040, %eax # imm = 0xFFFFFF00 ; CHECK-O3-NEXT: andq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: narrow_writeback_and: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movl $4294967040, %eax # imm = 0xFFFFFF00 +; CHECK-MFENCE-NEXT: andq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %ptr unordered, align 8 %v.new = and i64 %v, 4294967040 ;; 0xFFFF_FF00 store atomic i64 %v.new, ptr %ptr unordered, align 8 @@ -157,6 +189,11 @@ define void @narrow_writeback_xor(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: xorq $7, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: narrow_writeback_xor: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: xorq $7, (%rdi) +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %ptr unordered, align 8 %v.new = xor i64 %v, 7 store atomic i64 %v.new, ptr %ptr unordered, align 8 @@ -254,6 +291,14 @@ define void @store_i128(ptr %ptr, i128 %v) { ; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-O3-NEXT: vmovdqa %xmm0, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: store_i128: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: vmovq %rdx, %xmm0 +; CHECK-MFENCE-NEXT: vmovq %rsi, %xmm1 +; CHECK-MFENCE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-MFENCE-NEXT: vmovdqa %xmm0, (%rdi) +; CHECK-MFENCE-NEXT: retq store atomic i128 %v, ptr %ptr unordered, align 16 ret void } @@ -305,6 +350,28 @@ define i256 @load_i256(ptr %ptr) { ; CHECK-O3-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O3-NEXT: vzeroupper ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i256: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: pushq %rbx +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-MFENCE-NEXT: subq $32, %rsp +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 48 +; CHECK-MFENCE-NEXT: .cfi_offset %rbx, -16 +; CHECK-MFENCE-NEXT: movq %rdi, %rbx +; CHECK-MFENCE-NEXT: movq %rsp, %rdx +; CHECK-MFENCE-NEXT: movl $32, %edi +; CHECK-MFENCE-NEXT: xorl %ecx, %ecx +; CHECK-MFENCE-NEXT: callq __atomic_load@PLT +; CHECK-MFENCE-NEXT: vmovups (%rsp), %ymm0 +; CHECK-MFENCE-NEXT: vmovups %ymm0, (%rbx) +; CHECK-MFENCE-NEXT: movq %rbx, %rax +; CHECK-MFENCE-NEXT: addq $32, %rsp +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-MFENCE-NEXT: popq %rbx +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 8 +; CHECK-MFENCE-NEXT: vzeroupper +; CHECK-MFENCE-NEXT: retq %v = load atomic i256, ptr %ptr unordered, align 16 ret i256 %v } @@ -345,6 +412,24 @@ define void @store_i256(ptr %ptr, i256 %v) { ; CHECK-O3-NEXT: addq $40, %rsp ; CHECK-O3-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: store_i256: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: subq $40, %rsp +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 48 +; CHECK-MFENCE-NEXT: movq %rdi, %rax +; CHECK-MFENCE-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; CHECK-MFENCE-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; CHECK-MFENCE-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; CHECK-MFENCE-NEXT: movq %rsi, (%rsp) +; CHECK-MFENCE-NEXT: movq %rsp, %rdx +; CHECK-MFENCE-NEXT: movl $32, %edi +; CHECK-MFENCE-NEXT: movq %rax, %rsi +; CHECK-MFENCE-NEXT: xorl %ecx, %ecx +; CHECK-MFENCE-NEXT: callq __atomic_store@PLT +; CHECK-MFENCE-NEXT: addq $40, %rsp +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 8 +; CHECK-MFENCE-NEXT: retq store 
atomic i256 %v, ptr %ptr unordered, align 16 ret void } @@ -366,6 +451,14 @@ define void @vec_store(ptr %p0, <2 x i32> %vec) { ; CHECK-O3-NEXT: movl %eax, (%rdi) ; CHECK-O3-NEXT: movl %ecx, 4(%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: vec_store: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: vmovd %xmm0, %eax +; CHECK-MFENCE-NEXT: vpextrd $1, %xmm0, %ecx +; CHECK-MFENCE-NEXT: movl %eax, (%rdi) +; CHECK-MFENCE-NEXT: movl %ecx, 4(%rdi) +; CHECK-MFENCE-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %v2 = extractelement <2 x i32> %vec, i32 1 %p1 = getelementptr i32, ptr %p0, i64 1 @@ -391,6 +484,14 @@ define void @vec_store_unaligned(ptr %p0, <2 x i32> %vec) { ; CHECK-O3-NEXT: movl %eax, (%rdi) ; CHECK-O3-NEXT: movl %ecx, 4(%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: vec_store_unaligned: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: vmovd %xmm0, %eax +; CHECK-MFENCE-NEXT: vpextrd $1, %xmm0, %ecx +; CHECK-MFENCE-NEXT: movl %eax, (%rdi) +; CHECK-MFENCE-NEXT: movl %ecx, 4(%rdi) +; CHECK-MFENCE-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %v2 = extractelement <2 x i32> %vec, i32 1 %p1 = getelementptr i32, ptr %p0, i64 1 @@ -496,6 +597,12 @@ define i64 @load_fold_add3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: addq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_add3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: addq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = add i64 %v, %v2 @@ -515,6 +622,12 @@ define i64 @load_fold_sub1(ptr %p) { ; CHECK-O3-NEXT: movq (%rdi), %rax ; CHECK-O3-NEXT: addq $-15, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_sub1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: addq $-15, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = sub i64 %v, 15 ret i64 %ret @@ -556,6 +669,13 @@ define i64 @load_fold_mul1(ptr %p) { ; CHECK-O3-NEXT: leaq (%rax,%rax,4), %rax ; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_mul1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,4), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = mul i64 %v, 15 ret i64 %ret @@ -584,6 +704,12 @@ define i64 @load_fold_mul3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: imulq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_mul3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: imulq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = mul i64 %v, %v2 @@ -613,6 +739,20 @@ define i64 @load_fold_sdiv1(ptr %p) { ; CHECK-O3-NEXT: addq %rax, %rcx ; CHECK-O3-NEXT: movq %rcx, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_sdiv1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rcx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: imulq %rdx +; CHECK-MFENCE-NEXT: addq %rdx, %rcx +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: shrq $63, %rax +; CHECK-MFENCE-NEXT: sarq $3, %rcx +; CHECK-MFENCE-NEXT: addq 
%rax, %rcx +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = sdiv i64 %v, 15 ret i64 %ret @@ -644,6 +784,24 @@ define i64 @load_fold_sdiv2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_sdiv2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB35_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rsi +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB35_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = sdiv i64 %v, %v2 ret i64 %ret @@ -675,6 +833,25 @@ define i64 @load_fold_sdiv3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_sdiv3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq (%rsi), %rcx +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: orq %rcx, %rdx +; CHECK-MFENCE-NEXT: shrq $32, %rdx +; CHECK-MFENCE-NEXT: je .LBB36_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rcx +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB36_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %ecx +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = sdiv i64 %v, %v2 @@ -699,6 +876,14 @@ define i64 @load_fold_udiv1(ptr %p) { ; CHECK-O3-NEXT: mulxq %rax, %rax, %rax ; CHECK-O3-NEXT: shrq $3, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_udiv1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rdx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: mulxq %rax, %rax, %rax +; CHECK-MFENCE-NEXT: shrq $3, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = udiv i64 %v, 15 ret i64 %ret @@ -730,6 +915,24 @@ define i64 @load_fold_udiv2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_udiv2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB38_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rsi +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB38_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = udiv i64 %v, %v2 ret i64 %ret @@ -762,6 +965,25 @@ define i64 @load_fold_udiv3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: # kill: 
def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_udiv3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq (%rsi), %rcx +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: orq %rcx, %rdx +; CHECK-MFENCE-NEXT: shrq $32, %rdx +; CHECK-MFENCE-NEXT: je .LBB39_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rcx +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB39_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %ecx +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = udiv i64 %v, %v2 @@ -795,6 +1017,23 @@ define i64 @load_fold_srem1(ptr %p) { ; CHECK-O3-NEXT: subq %rax, %rcx ; CHECK-O3-NEXT: movq %rcx, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_srem1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rcx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: imulq %rdx +; CHECK-MFENCE-NEXT: addq %rcx, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: shrq $63, %rax +; CHECK-MFENCE-NEXT: sarq $3, %rdx +; CHECK-MFENCE-NEXT: addq %rax, %rdx +; CHECK-MFENCE-NEXT: leaq (%rdx,%rdx,4), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax +; CHECK-MFENCE-NEXT: subq %rax, %rcx +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = srem i64 %v, 15 ret i64 %ret @@ -828,6 +1067,25 @@ define i64 @load_fold_srem2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_srem2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB41_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rsi +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB41_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: movl %edx, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = srem i64 %v, %v2 ret i64 %ret @@ -861,6 +1119,26 @@ define i64 @load_fold_srem3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_srem3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq (%rsi), %rcx +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: orq %rcx, %rdx +; CHECK-MFENCE-NEXT: shrq $32, %rdx +; CHECK-MFENCE-NEXT: je .LBB42_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rcx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB42_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %ecx +; CHECK-MFENCE-NEXT: movl %edx, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = srem 
i64 %v, %v2 @@ -890,6 +1168,18 @@ define i64 @load_fold_urem1(ptr %p) { ; CHECK-O3-NEXT: leaq (%rcx,%rcx,2), %rcx ; CHECK-O3-NEXT: subq %rcx, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_urem1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: mulxq %rcx, %rcx, %rcx +; CHECK-MFENCE-NEXT: shrq $3, %rcx +; CHECK-MFENCE-NEXT: leaq (%rcx,%rcx,4), %rcx +; CHECK-MFENCE-NEXT: leaq (%rcx,%rcx,2), %rcx +; CHECK-MFENCE-NEXT: subq %rcx, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = urem i64 %v, 15 ret i64 %ret @@ -924,6 +1214,25 @@ define i64 @load_fold_urem2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_urem2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB44_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rsi +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB44_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: movl %edx, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = urem i64 %v, %v2 ret i64 %ret @@ -958,6 +1267,26 @@ define i64 @load_fold_urem3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_urem3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq (%rsi), %rcx +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: orq %rcx, %rdx +; CHECK-MFENCE-NEXT: shrq $32, %rdx +; CHECK-MFENCE-NEXT: je .LBB45_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rcx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB45_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %ecx +; CHECK-MFENCE-NEXT: movl %edx, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = urem i64 %v, %v2 @@ -989,6 +1318,11 @@ define i64 @load_fold_shl2(ptr %p, i64 %v2) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: shlxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_shl2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: shlxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = shl i64 %v, %v2 ret i64 %ret @@ -1008,6 +1342,12 @@ define i64 @load_fold_shl3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: shlxq %rax, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_shl3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: shlxq %rax, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = shl i64 %v, %v2 @@ -1039,6 +1379,11 @@ define i64 @load_fold_lshr2(ptr %p, i64 %v2) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: shrxq %rsi, 
(%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_lshr2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: shrxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = lshr i64 %v, %v2 ret i64 %ret @@ -1058,6 +1403,12 @@ define i64 @load_fold_lshr3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: shrxq %rax, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_lshr3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: shrxq %rax, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = lshr i64 %v, %v2 @@ -1089,6 +1440,11 @@ define i64 @load_fold_ashr2(ptr %p, i64 %v2) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: sarxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_ashr2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: sarxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = ashr i64 %v, %v2 ret i64 %ret @@ -1108,6 +1464,12 @@ define i64 @load_fold_ashr3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: sarxq %rax, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_ashr3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: sarxq %rax, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = ashr i64 %v, %v2 @@ -1127,6 +1489,12 @@ define i64 @load_fold_and1(ptr %p) { ; CHECK-O3-NEXT: movq (%rdi), %rax ; CHECK-O3-NEXT: andl $15, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_and1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: andl $15, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = and i64 %v, 15 ret i64 %ret @@ -1155,6 +1523,12 @@ define i64 @load_fold_and3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: andq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_and3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: andq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = and i64 %v, %v2 @@ -1196,6 +1570,12 @@ define i64 @load_fold_or3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: orq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_or3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: orq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = or i64 %v, %v2 @@ -1237,6 +1617,12 @@ define i64 @load_fold_xor3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: xorq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_xor3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: xorq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = xor i64 %v, %v2 @@ -1256,6 +1642,12 @@ define i1 @load_fold_icmp1(ptr %p) { ; CHECK-O3-NEXT: cmpq $15, (%rdi) ; CHECK-O3-NEXT: sete %al ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: 
load_fold_icmp1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: cmpq $15, (%rdi) +; CHECK-MFENCE-NEXT: sete %al +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = icmp eq i64 %v, 15 ret i1 %ret @@ -1274,6 +1666,12 @@ define i1 @load_fold_icmp2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: cmpq %rsi, (%rdi) ; CHECK-O3-NEXT: sete %al ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_icmp2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: cmpq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: sete %al +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = icmp eq i64 %v, %v2 ret i1 %ret @@ -1294,6 +1692,13 @@ define i1 @load_fold_icmp3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: cmpq %rax, (%rdi) ; CHECK-O3-NEXT: sete %al ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_icmp3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: cmpq %rax, (%rdi) +; CHECK-MFENCE-NEXT: sete %al +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = icmp eq i64 %v, %v2 @@ -1319,6 +1724,11 @@ define void @rmw_fold_add1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: addq $15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_add1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: addq $15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = add i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1338,6 +1748,11 @@ define void @rmw_fold_add2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: addq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_add2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: addq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = add i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1357,6 +1772,11 @@ define void @rmw_fold_sub1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: addq $-15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_sub1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: addq $-15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sub i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1376,6 +1796,11 @@ define void @rmw_fold_sub2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: subq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_sub2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: subq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sub i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1411,6 +1836,12 @@ define void @rmw_fold_mul2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: imulq (%rdi), %rsi ; CHECK-O3-NEXT: movq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_mul2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: imulq (%rdi), %rsi +; CHECK-MFENCE-NEXT: movq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = mul i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1447,6 +1878,20 @@ define void @rmw_fold_sdiv1(ptr %p, i64 %v) { ; CHECK-O3-NEXT: addq %rax, %rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_sdiv1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rcx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; 
CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: imulq %rdx +; CHECK-MFENCE-NEXT: addq %rcx, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: shrq $63, %rax +; CHECK-MFENCE-NEXT: sarq $3, %rdx +; CHECK-MFENCE-NEXT: addq %rax, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sdiv i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1482,6 +1927,26 @@ define void @rmw_fold_sdiv2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_sdiv2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB74_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rsi +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB74_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sdiv i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1534,6 +1999,26 @@ define void @rmw_fold_udiv2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_udiv2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB76_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rsi +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB76_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = udiv i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1577,6 +2062,23 @@ define void @rmw_fold_srem1(ptr %p, i64 %v) { ; CHECK-O3-NEXT: subq %rax, %rcx ; CHECK-O3-NEXT: movq %rcx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_srem1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rcx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: imulq %rdx +; CHECK-MFENCE-NEXT: addq %rcx, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: shrq $63, %rax +; CHECK-MFENCE-NEXT: sarq $3, %rdx +; CHECK-MFENCE-NEXT: addq %rax, %rdx +; CHECK-MFENCE-NEXT: leaq (%rdx,%rdx,4), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax +; CHECK-MFENCE-NEXT: subq %rax, %rcx +; CHECK-MFENCE-NEXT: movq %rcx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = srem i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1612,6 +2114,26 @@ define void @rmw_fold_srem2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $edx killed $edx def $rdx ; 
CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_srem2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB78_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rsi +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB78_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $edx killed $edx def $rdx +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = srem i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1644,6 +2166,18 @@ define void @rmw_fold_urem1(ptr %p, i64 %v) { ; CHECK-O3-NEXT: subq %rax, %rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_urem1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rdx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: mulxq %rax, %rax, %rax +; CHECK-MFENCE-NEXT: shrq $3, %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,4), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax +; CHECK-MFENCE-NEXT: subq %rax, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = urem i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1680,6 +2214,26 @@ define void @rmw_fold_urem2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_urem2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB80_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rsi +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB80_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $edx killed $edx def $rdx +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = urem i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1717,6 +2271,12 @@ define void @rmw_fold_shl2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: shlxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_shl2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: shlxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = shl i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1754,6 +2314,12 @@ define void @rmw_fold_lshr2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: shrxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_lshr2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: shrxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = lshr i64 
%prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1791,6 +2357,12 @@ define void @rmw_fold_ashr2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: sarxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_ashr2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: sarxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = ashr i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1812,6 +2384,11 @@ define void @rmw_fold_and1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: andq $15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_and1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: andq $15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = and i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1831,6 +2408,11 @@ define void @rmw_fold_and2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: andq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_and2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: andq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = and i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1850,6 +2432,11 @@ define void @rmw_fold_or1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: orq $15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_or1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: orq $15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = or i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1869,6 +2456,11 @@ define void @rmw_fold_or2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: orq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_or2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: orq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = or i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1888,6 +2480,11 @@ define void @rmw_fold_xor1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: xorq $15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_xor1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: xorq $15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = xor i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1907,6 +2504,11 @@ define void @rmw_fold_xor2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: xorq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_xor2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: xorq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = xor i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1943,6 +2545,13 @@ define i32 @fold_trunc_add(ptr %p, i32 %v2) { ; CHECK-O3-NEXT: addl %esi, %eax ; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_trunc_add: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: addl %esi, %eax +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %trunc = trunc i64 %v to i32 %ret = add i32 %trunc, %v2 @@ -1964,6 +2573,13 @@ define i32 @fold_trunc_and(ptr %p, i32 %v2) { ; 
CHECK-O3-NEXT: andl %esi, %eax ; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_trunc_and: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: andl %esi, %eax +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %trunc = trunc i64 %v to i32 %ret = and i32 %trunc, %v2 @@ -1985,6 +2601,13 @@ define i32 @fold_trunc_or(ptr %p, i32 %v2) { ; CHECK-O3-NEXT: orl %esi, %eax ; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_trunc_or: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: orl %esi, %eax +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %trunc = trunc i64 %v to i32 %ret = or i32 %trunc, %v2 @@ -2012,6 +2635,15 @@ define i32 @split_load(ptr %p) { ; CHECK-O3-NEXT: orl %eax, %ecx ; CHECK-O3-NEXT: movzbl %cl, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: split_load: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: orl %eax, %ecx +; CHECK-MFENCE-NEXT: movzbl %cl, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %b1 = trunc i64 %v to i8 %v.shift = lshr i64 %v, 32 @@ -2093,12 +2725,26 @@ define void @dead_store(ptr %p, i64 %v) { ;; isn't violated. define i64 @nofold_fence(ptr %p) { -; CHECK-LABEL: nofold_fence: -; CHECK: # %bb.0: -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: mfence -; CHECK-NEXT: addq $15, %rax -; CHECK-NEXT: retq +; CHECK-O0-LABEL: nofold_fence: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq (%rdi), %rax +; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O0-NEXT: addq $15, %rax +; CHECK-O0-NEXT: retq +; +; CHECK-O3-LABEL: nofold_fence: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movq (%rdi), %rax +; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O3-NEXT: addq $15, %rax +; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: nofold_fence: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: mfence +; CHECK-MFENCE-NEXT: addq $15, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 fence seq_cst %ret = add i64 %v, 15 @@ -2148,6 +2794,12 @@ define i64 @fold_constant(i64 %arg) { ; CHECK-O3-NEXT: movq %rdi, %rax ; CHECK-O3-NEXT: addq Constant(%rip), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_constant: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq %rdi, %rax +; CHECK-MFENCE-NEXT: addq Constant(%rip), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr @Constant unordered, align 8 %ret = add i64 %v, %arg ret i64 %ret @@ -2167,12 +2819,26 @@ define i64 @fold_constant_clobber(ptr %p, i64 %arg) { } define i64 @fold_constant_fence(i64 %arg) { -; CHECK-LABEL: fold_constant_fence: -; CHECK: # %bb.0: -; CHECK-NEXT: movq Constant(%rip), %rax -; CHECK-NEXT: mfence -; CHECK-NEXT: addq %rdi, %rax -; CHECK-NEXT: retq +; CHECK-O0-LABEL: fold_constant_fence: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq Constant(%rip), %rax +; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O0-NEXT: addq %rdi, %rax +; CHECK-O0-NEXT: retq +; +; CHECK-O3-LABEL: fold_constant_fence: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movq Constant(%rip), %rax +; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; 
CHECK-O3-NEXT: addq %rdi, %rax +; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_constant_fence: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq Constant(%rip), %rax +; CHECK-MFENCE-NEXT: mfence +; CHECK-MFENCE-NEXT: addq %rdi, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr @Constant unordered, align 8 fence seq_cst %ret = add i64 %v, %arg @@ -2194,12 +2860,26 @@ define i64 @fold_invariant_clobber(ptr dereferenceable(8) %p, i64 %arg) { define i64 @fold_invariant_fence(ptr dereferenceable(8) %p, i64 %arg) { -; CHECK-LABEL: fold_invariant_fence: -; CHECK: # %bb.0: -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: mfence -; CHECK-NEXT: addq %rsi, %rax -; CHECK-NEXT: retq +; CHECK-O0-LABEL: fold_invariant_fence: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq (%rdi), %rax +; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O0-NEXT: addq %rsi, %rax +; CHECK-O0-NEXT: retq +; +; CHECK-O3-LABEL: fold_invariant_fence: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movq (%rdi), %rax +; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O3-NEXT: addq %rsi, %rax +; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_invariant_fence: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: mfence +; CHECK-MFENCE-NEXT: addq %rsi, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8, !invariant.load !{} fence seq_cst %ret = add i64 %v, %arg @@ -2222,6 +2902,12 @@ define i16 @load_i8_anyext_i16(ptr %ptr) { ; CHECK-O3-NEXT: movzbl (%rdi), %eax ; CHECK-O3-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i8_anyext_i16: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax +; CHECK-MFENCE-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i8, ptr %ptr unordered, align 2 %vec = insertelement <2 x i8> undef, i8 %v, i32 0 %res = bitcast <2 x i8> %vec to i16 @@ -2239,6 +2925,11 @@ define i32 @load_i8_anyext_i32(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzbl (%rdi), %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i8_anyext_i32: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i8, ptr %ptr unordered, align 4 %vec = insertelement <4 x i8> undef, i8 %v, i32 0 %res = bitcast <4 x i8> %vec to i32 @@ -2257,6 +2948,11 @@ define i32 @load_i16_anyext_i32(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzwl (%rdi), %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i16_anyext_i32: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i16, ptr %ptr unordered, align 4 %vec = insertelement <2 x i16> undef, i16 %v, i64 0 %res = bitcast <2 x i16> %vec to i32 @@ -2279,6 +2975,13 @@ define i64 @load_i16_anyext_i64(ptr %ptr) { ; CHECK-O3-NEXT: vmovd %eax, %xmm0 ; CHECK-O3-NEXT: vmovq %xmm0, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i16_anyext_i64: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax +; CHECK-MFENCE-NEXT: vmovd %eax, %xmm0 +; CHECK-MFENCE-NEXT: vmovq %xmm0, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i16, ptr %ptr unordered, align 8 %vec = insertelement <4 x i16> undef, i16 %v, i64 0 %res = bitcast <4 x i16> %vec to i64 @@ -2307,6 +3010,15 @@ define i16 @load_combine(ptr %p) { ; CHECK-O3-NEXT: orl %ecx, %eax ; CHECK-O3-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_combine: +; CHECK-MFENCE: # %bb.0: +; 
CHECK-MFENCE-NEXT: movzbl (%rdi), %ecx +; CHECK-MFENCE-NEXT: movzbl 1(%rdi), %eax +; CHECK-MFENCE-NEXT: shll $8, %eax +; CHECK-MFENCE-NEXT: orl %ecx, %eax +; CHECK-MFENCE-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-MFENCE-NEXT: retq %v1 = load atomic i8, ptr %p unordered, align 2 %p2 = getelementptr i8, ptr %p, i64 1 %v2 = load atomic i8, ptr %p2 unordered, align 1 @@ -2321,7 +3033,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) { ; CHECK-O0-LABEL: fold_cmp_over_fence: ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: movl (%rdi), %eax -; CHECK-O0-NEXT: mfence +; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; CHECK-O0-NEXT: cmpl %eax, %esi ; CHECK-O0-NEXT: jne .LBB116_2 ; CHECK-O0-NEXT: # %bb.1: # %taken @@ -2335,7 +3047,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) { ; CHECK-O3-LABEL: fold_cmp_over_fence: ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movl (%rdi), %eax -; CHECK-O3-NEXT: mfence +; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; CHECK-O3-NEXT: cmpl %eax, %esi ; CHECK-O3-NEXT: jne .LBB116_2 ; CHECK-O3-NEXT: # %bb.1: # %taken @@ -2344,6 +3056,19 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) { ; CHECK-O3-NEXT: .LBB116_2: # %untaken ; CHECK-O3-NEXT: xorl %eax, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_cmp_over_fence: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movl (%rdi), %eax +; CHECK-MFENCE-NEXT: mfence +; CHECK-MFENCE-NEXT: cmpl %eax, %esi +; CHECK-MFENCE-NEXT: jne .LBB116_2 +; CHECK-MFENCE-NEXT: # %bb.1: # %taken +; CHECK-MFENCE-NEXT: movb $1, %al +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB116_2: # %untaken +; CHECK-MFENCE-NEXT: xorl %eax, %eax +; CHECK-MFENCE-NEXT: retq %v2 = load atomic i32, ptr %p unordered, align 4 fence seq_cst %cmp = icmp eq i32 %v1, %v2 diff --git a/llvm/test/CodeGen/X86/mfence.ll b/llvm/test/CodeGen/X86/mfence.ll index 8c29af8648712..1be1e4793461b 100644 --- a/llvm/test/CodeGen/X86/mfence.ll +++ b/llvm/test/CodeGen/X86/mfence.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2,+avoid-mfence | FileCheck %s --check-prefix=X64-NO-MFENCE ; It doesn't matter if an x86-64 target has specified "no-sse2"; we still can use mfence. 
@@ -26,7 +27,33 @@ define i32 @fence(ptr %ptr) { ; X64-NEXT: mfence ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: retq +; +; X64-NO-MFENCE-LABEL: fence: +; X64-NO-MFENCE: # %bb.0: +; X64-NO-MFENCE-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; X64-NO-MFENCE-NEXT: movl (%rdi), %eax +; X64-NO-MFENCE-NEXT: retq %atomic = atomicrmw add ptr %ptr, i32 0 seq_cst ret i32 %atomic } +define void @mfence() nounwind { +; X32-LABEL: mfence: +; X32: # %bb.0: +; X32-NEXT: mfence +; X32-NEXT: retl +; +; X64-LABEL: mfence: +; X64: # %bb.0: +; X64-NEXT: mfence +; X64-NEXT: retq +; +; X64-NO-MFENCE-LABEL: mfence: +; X64-NO-MFENCE: # %bb.0: +; X64-NO-MFENCE-NEXT: mfence +; X64-NO-MFENCE-NEXT: retq + call void @llvm.x86.sse2.mfence() + ret void +} +declare void @llvm.x86.sse2.mfence() nounwind readnone + From 422a1d1b422880b385812673c71cb3e35766661e Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 11 Mar 2025 11:49:56 +0100 Subject: [PATCH 2/3] don't use tuning for avoid mfence --- llvm/lib/Target/X86/X86.td | 48 +- llvm/lib/Target/X86/X86Subtarget.h | 3 + llvm/test/CodeGen/X86/atomic-idempotent.ll | 10 +- llvm/test/CodeGen/X86/atomic-unordered.ll | 761 +------------------ llvm/test/CodeGen/X86/implicit-null-check.ll | 3 +- llvm/test/CodeGen/X86/membarrier.ll | 4 +- llvm/test/CodeGen/X86/mfence.ll | 35 +- 7 files changed, 59 insertions(+), 805 deletions(-) diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 06c8ed8365bd5..aa59d9a70a212 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -772,9 +772,6 @@ def TuningUseGLMDivSqrtCosts def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true", "Target has branch hint feature">; -def TuningAvoidMFENCE - : SubtargetFeature<"avoid-mfence", "AvoidMFence", "true", - "Avoid MFENCE for fence seq_cst, and instead use lock or">; //===----------------------------------------------------------------------===// // X86 CPU Families @@ -837,8 +834,7 @@ def ProcessorFeatures { TuningSlow3OpsLEA, TuningSlowDivide64, TuningSlowIncDec, - TuningInsertVZEROUPPER, - TuningAvoidMFENCE + TuningInsertVZEROUPPER ]; list X86_64V2Features = !listconcat(X86_64V1Features, [ @@ -854,8 +850,7 @@ def ProcessorFeatures { TuningFastSHLDRotate, TuningFast15ByteNOP, TuningPOPCNTFalseDeps, - TuningInsertVZEROUPPER, - TuningAvoidMFENCE + TuningInsertVZEROUPPER ]; list X86_64V3Features = !listconcat(X86_64V2Features, [ @@ -874,8 +869,7 @@ def ProcessorFeatures { TuningPOPCNTFalseDeps, TuningLZCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit, - TuningAvoidMFENCE + TuningAllowLight256Bit ]; list X86_64V4Features = !listconcat(X86_64V3Features, [ @@ -899,8 +893,7 @@ def ProcessorFeatures { TuningFastGather, TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit, - TuningAvoidMFENCE + TuningAllowLight256Bit ]; // Nehalem @@ -908,8 +901,7 @@ def ProcessorFeatures { list NHMTuning = [TuningMacroFusion, TuningSlowDivide64, TuningInsertVZEROUPPER, - TuningNoDomainDelayMov, - TuningAvoidMFENCE]; + TuningNoDomainDelayMov]; // Westmere list WSMAdditionalFeatures = [FeaturePCLMUL]; @@ -930,8 +922,7 @@ def ProcessorFeatures { TuningFast15ByteNOP, TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, - TuningNoDomainDelayMov, - TuningAvoidMFENCE]; + TuningNoDomainDelayMov]; list SNBFeatures = !listconcat(WSMFeatures, SNBAdditionalFeatures); @@ -997,8 +988,7 @@ def ProcessorFeatures { TuningAllowLight256Bit, TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, - TuningNoDomainDelayBlend, - TuningAvoidMFENCE]; + 
TuningNoDomainDelayBlend]; list SKLFeatures = !listconcat(BDWFeatures, SKLAdditionalFeatures); @@ -1033,8 +1023,7 @@ def ProcessorFeatures { TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, TuningNoDomainDelayBlend, - TuningFastImmVectorShift, - TuningAvoidMFENCE]; + TuningFastImmVectorShift]; list SKXFeatures = !listconcat(BDWFeatures, SKXAdditionalFeatures); @@ -1077,8 +1066,7 @@ def ProcessorFeatures { TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, TuningNoDomainDelayBlend, - TuningFastImmVectorShift, - TuningAvoidMFENCE]; + TuningFastImmVectorShift]; list CNLFeatures = !listconcat(SKLFeatures, CNLAdditionalFeatures); @@ -1107,8 +1095,7 @@ def ProcessorFeatures { TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, TuningNoDomainDelayBlend, - TuningFastImmVectorShift, - TuningAvoidMFENCE]; + TuningFastImmVectorShift]; list ICLFeatures = !listconcat(CNLFeatures, ICLAdditionalFeatures); @@ -1282,7 +1269,7 @@ def ProcessorFeatures { // Tremont list TRMAdditionalFeatures = [FeatureCLWB, FeatureGFNI]; - list TRMAdditionalTuning = [TuningAvoidMFENCE]; + list TRMAdditionalTuning = []; list TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning); list TRMFeatures = !listconcat(GLPFeatures, TRMAdditionalFeatures); @@ -1461,8 +1448,7 @@ def ProcessorFeatures { TuningFastImm16, TuningSBBDepBreaking, TuningSlowDivide64, - TuningSlowSHLD, - TuningAvoidMFENCE]; + TuningSlowSHLD]; list BtVer2Features = !listconcat(BtVer1Features, BtVer2AdditionalFeatures); @@ -1491,8 +1477,7 @@ def ProcessorFeatures { TuningFastScalarShiftMasks, TuningBranchFusion, TuningSBBDepBreaking, - TuningInsertVZEROUPPER, - TuningAvoidMFENCE]; + TuningInsertVZEROUPPER]; // PileDriver list BdVer2AdditionalFeatures = [FeatureF16C, @@ -1572,8 +1557,7 @@ def ProcessorFeatures { TuningSlowSHLD, TuningSBBDepBreaking, TuningInsertVZEROUPPER, - TuningAllowLight256Bit, - TuningAvoidMFENCE]; + TuningAllowLight256Bit]; list ZN2AdditionalFeatures = [FeatureCLWB, FeatureRDPID, FeatureRDPRU, @@ -1759,7 +1743,6 @@ def : ProcModel; } foreach P = ["penryn", "core_2_duo_sse4_1"] in { @@ -1778,8 +1761,7 @@ def : ProcModel; } diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 722076ca88c9c..8f2d326a69398 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -280,6 +280,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// supports it. bool hasMFence() const { return hasSSE2() || is64Bit(); } + /// Avoid use of `mfence` for `fence seq_cst`, and instead use `lock or`.
+ bool avoidMFence() const { return is64Bit(); } + const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll index 91355bd64cade..020f9eb793102 100644 --- a/llvm/test/CodeGen/X86/atomic-idempotent.ll +++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll @@ -14,7 +14,7 @@ define i8 @add8(ptr %p) #0 { ; X64-LABEL: add8: ; X64: # %bb.0: -; X64-NEXT: mfence +; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movzbl (%rdi), %eax ; X64-NEXT: retq ; @@ -47,7 +47,7 @@ define i8 @add8(ptr %p) #0 { define i16 @or16(ptr %p) #0 { ; X64-LABEL: or16: ; X64: # %bb.0: -; X64-NEXT: mfence +; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: retq ; @@ -80,7 +80,7 @@ define i16 @or16(ptr %p) #0 { define i32 @xor32(ptr %p) #0 { ; X64-LABEL: xor32: ; X64: # %bb.0: -; X64-NEXT: mfence +; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: retq ; @@ -113,7 +113,7 @@ define i32 @xor32(ptr %p) #0 { define i64 @sub64(ptr %p) #0 { ; X64-LABEL: sub64: ; X64: # %bb.0: -; X64-NEXT: mfence +; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: retq ; @@ -265,7 +265,7 @@ define i128 @or128(ptr %p) #0 { define i32 @and32 (ptr %p) #0 { ; X64-LABEL: and32: ; X64: # %bb.0: -; X64-NEXT: mfence +; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll index ff101b9037f0e..e8e0ee0b7ef49 100644 --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake | FileCheck --check-prefixes=CHECK,CHECK-O0 %s ; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake | FileCheck --check-prefixes=CHECK,CHECK-O3 %s -; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -mattr=-avoid-mfence | FileCheck --check-prefixes=CHECK,CHECK-MFENCE %s define i8 @load_i8(ptr %ptr) { ; CHECK-O0-LABEL: load_i8: @@ -13,11 +12,6 @@ define i8 @load_i8(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzbl (%rdi), %eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_i8: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax -; CHECK-MFENCE-NEXT: retq %v = load atomic i8, ptr %ptr unordered, align 1 ret i8 %v } @@ -33,11 +27,6 @@ define void @store_i8(ptr %ptr, i8 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movb %sil, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: store_i8: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movb %sil, (%rdi) -; CHECK-MFENCE-NEXT: retq store atomic i8 %v, ptr %ptr unordered, align 1 ret void } @@ -52,11 +41,6 @@ define i16 @load_i16(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzwl (%rdi), %eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_i16: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax -; CHECK-MFENCE-NEXT: retq %v = load atomic i16, ptr %ptr unordered, align 2 ret i16 %v } @@ -73,11 +57,6 @@ define void @store_i16(ptr %ptr, i16 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movw %si, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: store_i16: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movw 
%si, (%rdi) -; CHECK-MFENCE-NEXT: retq store atomic i16 %v, ptr %ptr unordered, align 2 ret void } @@ -137,11 +116,6 @@ define void @narrow_writeback_or(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: orq $7, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: narrow_writeback_or: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: orq $7, (%rdi) -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %ptr unordered, align 8 %v.new = or i64 %v, 7 store atomic i64 %v.new, ptr %ptr unordered, align 8 @@ -164,12 +138,6 @@ define void @narrow_writeback_and(ptr %ptr) { ; CHECK-O3-NEXT: movl $4294967040, %eax # imm = 0xFFFFFF00 ; CHECK-O3-NEXT: andq %rax, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: narrow_writeback_and: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movl $4294967040, %eax # imm = 0xFFFFFF00 -; CHECK-MFENCE-NEXT: andq %rax, (%rdi) -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %ptr unordered, align 8 %v.new = and i64 %v, 4294967040 ;; 0xFFFF_FF00 store atomic i64 %v.new, ptr %ptr unordered, align 8 @@ -189,11 +157,6 @@ define void @narrow_writeback_xor(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: xorq $7, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: narrow_writeback_xor: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: xorq $7, (%rdi) -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %ptr unordered, align 8 %v.new = xor i64 %v, 7 store atomic i64 %v.new, ptr %ptr unordered, align 8 @@ -291,14 +254,6 @@ define void @store_i128(ptr %ptr, i128 %v) { ; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-O3-NEXT: vmovdqa %xmm0, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: store_i128: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: vmovq %rdx, %xmm0 -; CHECK-MFENCE-NEXT: vmovq %rsi, %xmm1 -; CHECK-MFENCE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; CHECK-MFENCE-NEXT: vmovdqa %xmm0, (%rdi) -; CHECK-MFENCE-NEXT: retq store atomic i128 %v, ptr %ptr unordered, align 16 ret void } @@ -350,28 +305,6 @@ define i256 @load_i256(ptr %ptr) { ; CHECK-O3-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O3-NEXT: vzeroupper ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_i256: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: pushq %rbx -; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 16 -; CHECK-MFENCE-NEXT: subq $32, %rsp -; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 48 -; CHECK-MFENCE-NEXT: .cfi_offset %rbx, -16 -; CHECK-MFENCE-NEXT: movq %rdi, %rbx -; CHECK-MFENCE-NEXT: movq %rsp, %rdx -; CHECK-MFENCE-NEXT: movl $32, %edi -; CHECK-MFENCE-NEXT: xorl %ecx, %ecx -; CHECK-MFENCE-NEXT: callq __atomic_load@PLT -; CHECK-MFENCE-NEXT: vmovups (%rsp), %ymm0 -; CHECK-MFENCE-NEXT: vmovups %ymm0, (%rbx) -; CHECK-MFENCE-NEXT: movq %rbx, %rax -; CHECK-MFENCE-NEXT: addq $32, %rsp -; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 16 -; CHECK-MFENCE-NEXT: popq %rbx -; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 8 -; CHECK-MFENCE-NEXT: vzeroupper -; CHECK-MFENCE-NEXT: retq %v = load atomic i256, ptr %ptr unordered, align 16 ret i256 %v } @@ -412,24 +345,6 @@ define void @store_i256(ptr %ptr, i256 %v) { ; CHECK-O3-NEXT: addq $40, %rsp ; CHECK-O3-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: store_i256: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: subq $40, %rsp -; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 48 -; CHECK-MFENCE-NEXT: movq %rdi, %rax -; CHECK-MFENCE-NEXT: movq %r8, {{[0-9]+}}(%rsp) -; CHECK-MFENCE-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-MFENCE-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-MFENCE-NEXT: movq %rsi, (%rsp) -; CHECK-MFENCE-NEXT: 
movq %rsp, %rdx -; CHECK-MFENCE-NEXT: movl $32, %edi -; CHECK-MFENCE-NEXT: movq %rax, %rsi -; CHECK-MFENCE-NEXT: xorl %ecx, %ecx -; CHECK-MFENCE-NEXT: callq __atomic_store@PLT -; CHECK-MFENCE-NEXT: addq $40, %rsp -; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 8 -; CHECK-MFENCE-NEXT: retq store atomic i256 %v, ptr %ptr unordered, align 16 ret void } @@ -451,14 +366,6 @@ define void @vec_store(ptr %p0, <2 x i32> %vec) { ; CHECK-O3-NEXT: movl %eax, (%rdi) ; CHECK-O3-NEXT: movl %ecx, 4(%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: vec_store: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: vmovd %xmm0, %eax -; CHECK-MFENCE-NEXT: vpextrd $1, %xmm0, %ecx -; CHECK-MFENCE-NEXT: movl %eax, (%rdi) -; CHECK-MFENCE-NEXT: movl %ecx, 4(%rdi) -; CHECK-MFENCE-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %v2 = extractelement <2 x i32> %vec, i32 1 %p1 = getelementptr i32, ptr %p0, i64 1 @@ -484,14 +391,6 @@ define void @vec_store_unaligned(ptr %p0, <2 x i32> %vec) { ; CHECK-O3-NEXT: movl %eax, (%rdi) ; CHECK-O3-NEXT: movl %ecx, 4(%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: vec_store_unaligned: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: vmovd %xmm0, %eax -; CHECK-MFENCE-NEXT: vpextrd $1, %xmm0, %ecx -; CHECK-MFENCE-NEXT: movl %eax, (%rdi) -; CHECK-MFENCE-NEXT: movl %ecx, 4(%rdi) -; CHECK-MFENCE-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %v2 = extractelement <2 x i32> %vec, i32 1 %p1 = getelementptr i32, ptr %p0, i64 1 @@ -597,12 +496,6 @@ define i64 @load_fold_add3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: addq (%rdi), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_add3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rsi), %rax -; CHECK-MFENCE-NEXT: addq (%rdi), %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = add i64 %v, %v2 @@ -622,12 +515,6 @@ define i64 @load_fold_sub1(ptr %p) { ; CHECK-O3-NEXT: movq (%rdi), %rax ; CHECK-O3-NEXT: addq $-15, %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_sub1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: addq $-15, %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = sub i64 %v, 15 ret i64 %ret @@ -669,13 +556,6 @@ define i64 @load_fold_mul1(ptr %p) { ; CHECK-O3-NEXT: leaq (%rax,%rax,4), %rax ; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_mul1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: leaq (%rax,%rax,4), %rax -; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = mul i64 %v, 15 ret i64 %ret @@ -704,12 +584,6 @@ define i64 @load_fold_mul3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: imulq (%rdi), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_mul3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rsi), %rax -; CHECK-MFENCE-NEXT: imulq (%rdi), %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = mul i64 %v, %v2 @@ -739,20 +613,6 @@ define i64 @load_fold_sdiv1(ptr %p) { ; CHECK-O3-NEXT: addq %rax, %rcx ; CHECK-O3-NEXT: movq %rcx, %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_sdiv1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rcx -; CHECK-MFENCE-NEXT: movabsq 
$-8608480567731124087, %rdx # imm = 0x8888888888888889 -; CHECK-MFENCE-NEXT: movq %rcx, %rax -; CHECK-MFENCE-NEXT: imulq %rdx -; CHECK-MFENCE-NEXT: addq %rdx, %rcx -; CHECK-MFENCE-NEXT: movq %rcx, %rax -; CHECK-MFENCE-NEXT: shrq $63, %rax -; CHECK-MFENCE-NEXT: sarq $3, %rcx -; CHECK-MFENCE-NEXT: addq %rax, %rcx -; CHECK-MFENCE-NEXT: movq %rcx, %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = sdiv i64 %v, 15 ret i64 %ret @@ -784,24 +644,6 @@ define i64 @load_fold_sdiv2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_sdiv2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq %rax, %rcx -; CHECK-MFENCE-NEXT: orq %rsi, %rcx -; CHECK-MFENCE-NEXT: shrq $32, %rcx -; CHECK-MFENCE-NEXT: je .LBB35_1 -; CHECK-MFENCE-NEXT: # %bb.2: -; CHECK-MFENCE-NEXT: cqto -; CHECK-MFENCE-NEXT: idivq %rsi -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB35_1: -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divl %esi -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = sdiv i64 %v, %v2 ret i64 %ret @@ -833,25 +675,6 @@ define i64 @load_fold_sdiv3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_sdiv3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq (%rsi), %rcx -; CHECK-MFENCE-NEXT: movq %rax, %rdx -; CHECK-MFENCE-NEXT: orq %rcx, %rdx -; CHECK-MFENCE-NEXT: shrq $32, %rdx -; CHECK-MFENCE-NEXT: je .LBB36_1 -; CHECK-MFENCE-NEXT: # %bb.2: -; CHECK-MFENCE-NEXT: cqto -; CHECK-MFENCE-NEXT: idivq %rcx -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB36_1: -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divl %ecx -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = sdiv i64 %v, %v2 @@ -876,14 +699,6 @@ define i64 @load_fold_udiv1(ptr %p) { ; CHECK-O3-NEXT: mulxq %rax, %rax, %rax ; CHECK-O3-NEXT: shrq $3, %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_udiv1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rdx -; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889 -; CHECK-MFENCE-NEXT: mulxq %rax, %rax, %rax -; CHECK-MFENCE-NEXT: shrq $3, %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = udiv i64 %v, 15 ret i64 %ret @@ -915,24 +730,6 @@ define i64 @load_fold_udiv2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_udiv2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq %rax, %rcx -; CHECK-MFENCE-NEXT: orq %rsi, %rcx -; CHECK-MFENCE-NEXT: shrq $32, %rcx -; CHECK-MFENCE-NEXT: je .LBB38_1 -; CHECK-MFENCE-NEXT: # %bb.2: -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divq %rsi -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB38_1: -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: 
divl %esi -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = udiv i64 %v, %v2 ret i64 %ret @@ -965,25 +762,6 @@ define i64 @load_fold_udiv3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_udiv3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq (%rsi), %rcx -; CHECK-MFENCE-NEXT: movq %rax, %rdx -; CHECK-MFENCE-NEXT: orq %rcx, %rdx -; CHECK-MFENCE-NEXT: shrq $32, %rdx -; CHECK-MFENCE-NEXT: je .LBB39_1 -; CHECK-MFENCE-NEXT: # %bb.2: -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divq %rcx -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB39_1: -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divl %ecx -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = udiv i64 %v, %v2 @@ -1017,23 +795,6 @@ define i64 @load_fold_srem1(ptr %p) { ; CHECK-O3-NEXT: subq %rax, %rcx ; CHECK-O3-NEXT: movq %rcx, %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_srem1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rcx -; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 -; CHECK-MFENCE-NEXT: movq %rcx, %rax -; CHECK-MFENCE-NEXT: imulq %rdx -; CHECK-MFENCE-NEXT: addq %rcx, %rdx -; CHECK-MFENCE-NEXT: movq %rdx, %rax -; CHECK-MFENCE-NEXT: shrq $63, %rax -; CHECK-MFENCE-NEXT: sarq $3, %rdx -; CHECK-MFENCE-NEXT: addq %rax, %rdx -; CHECK-MFENCE-NEXT: leaq (%rdx,%rdx,4), %rax -; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax -; CHECK-MFENCE-NEXT: subq %rax, %rcx -; CHECK-MFENCE-NEXT: movq %rcx, %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = srem i64 %v, 15 ret i64 %ret @@ -1067,25 +828,6 @@ define i64 @load_fold_srem2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_srem2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq %rax, %rcx -; CHECK-MFENCE-NEXT: orq %rsi, %rcx -; CHECK-MFENCE-NEXT: shrq $32, %rcx -; CHECK-MFENCE-NEXT: je .LBB41_1 -; CHECK-MFENCE-NEXT: # %bb.2: -; CHECK-MFENCE-NEXT: cqto -; CHECK-MFENCE-NEXT: idivq %rsi -; CHECK-MFENCE-NEXT: movq %rdx, %rax -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB41_1: -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divl %esi -; CHECK-MFENCE-NEXT: movl %edx, %eax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = srem i64 %v, %v2 ret i64 %ret @@ -1119,26 +861,6 @@ define i64 @load_fold_srem3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_srem3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq (%rsi), %rcx -; CHECK-MFENCE-NEXT: movq %rax, %rdx -; CHECK-MFENCE-NEXT: orq %rcx, %rdx -; CHECK-MFENCE-NEXT: shrq $32, %rdx -; CHECK-MFENCE-NEXT: je .LBB42_1 -; CHECK-MFENCE-NEXT: # %bb.2: -; CHECK-MFENCE-NEXT: cqto -; CHECK-MFENCE-NEXT: idivq %rcx -; CHECK-MFENCE-NEXT: movq %rdx, %rax -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB42_1: -; 
CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divl %ecx -; CHECK-MFENCE-NEXT: movl %edx, %eax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = srem i64 %v, %v2 @@ -1168,18 +890,6 @@ define i64 @load_fold_urem1(ptr %p) { ; CHECK-O3-NEXT: leaq (%rcx,%rcx,2), %rcx ; CHECK-O3-NEXT: subq %rcx, %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_urem1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889 -; CHECK-MFENCE-NEXT: movq %rax, %rdx -; CHECK-MFENCE-NEXT: mulxq %rcx, %rcx, %rcx -; CHECK-MFENCE-NEXT: shrq $3, %rcx -; CHECK-MFENCE-NEXT: leaq (%rcx,%rcx,4), %rcx -; CHECK-MFENCE-NEXT: leaq (%rcx,%rcx,2), %rcx -; CHECK-MFENCE-NEXT: subq %rcx, %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = urem i64 %v, 15 ret i64 %ret @@ -1214,25 +924,6 @@ define i64 @load_fold_urem2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_urem2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq %rax, %rcx -; CHECK-MFENCE-NEXT: orq %rsi, %rcx -; CHECK-MFENCE-NEXT: shrq $32, %rcx -; CHECK-MFENCE-NEXT: je .LBB44_1 -; CHECK-MFENCE-NEXT: # %bb.2: -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divq %rsi -; CHECK-MFENCE-NEXT: movq %rdx, %rax -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB44_1: -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divl %esi -; CHECK-MFENCE-NEXT: movl %edx, %eax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = urem i64 %v, %v2 ret i64 %ret @@ -1267,26 +958,6 @@ define i64 @load_fold_urem3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_urem3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq (%rsi), %rcx -; CHECK-MFENCE-NEXT: movq %rax, %rdx -; CHECK-MFENCE-NEXT: orq %rcx, %rdx -; CHECK-MFENCE-NEXT: shrq $32, %rdx -; CHECK-MFENCE-NEXT: je .LBB45_1 -; CHECK-MFENCE-NEXT: # %bb.2: -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divq %rcx -; CHECK-MFENCE-NEXT: movq %rdx, %rax -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB45_1: -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divl %ecx -; CHECK-MFENCE-NEXT: movl %edx, %eax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = urem i64 %v, %v2 @@ -1318,11 +989,6 @@ define i64 @load_fold_shl2(ptr %p, i64 %v2) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: shlxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_shl2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: shlxq %rsi, (%rdi), %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = shl i64 %v, %v2 ret i64 %ret @@ -1342,12 +1008,6 @@ define i64 @load_fold_shl3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: shlxq %rax, (%rdi), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_shl3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rsi), %rax -; 
CHECK-MFENCE-NEXT: shlxq %rax, (%rdi), %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = shl i64 %v, %v2 @@ -1379,11 +1039,6 @@ define i64 @load_fold_lshr2(ptr %p, i64 %v2) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: shrxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_lshr2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: shrxq %rsi, (%rdi), %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = lshr i64 %v, %v2 ret i64 %ret @@ -1403,12 +1058,6 @@ define i64 @load_fold_lshr3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: shrxq %rax, (%rdi), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_lshr3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rsi), %rax -; CHECK-MFENCE-NEXT: shrxq %rax, (%rdi), %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = lshr i64 %v, %v2 @@ -1440,11 +1089,6 @@ define i64 @load_fold_ashr2(ptr %p, i64 %v2) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: sarxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_ashr2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: sarxq %rsi, (%rdi), %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = ashr i64 %v, %v2 ret i64 %ret @@ -1464,12 +1108,6 @@ define i64 @load_fold_ashr3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: sarxq %rax, (%rdi), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_ashr3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rsi), %rax -; CHECK-MFENCE-NEXT: sarxq %rax, (%rdi), %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = ashr i64 %v, %v2 @@ -1489,12 +1127,6 @@ define i64 @load_fold_and1(ptr %p) { ; CHECK-O3-NEXT: movq (%rdi), %rax ; CHECK-O3-NEXT: andl $15, %eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_and1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: andl $15, %eax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = and i64 %v, 15 ret i64 %ret @@ -1523,12 +1155,6 @@ define i64 @load_fold_and3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: andq (%rdi), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_and3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rsi), %rax -; CHECK-MFENCE-NEXT: andq (%rdi), %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = and i64 %v, %v2 @@ -1570,12 +1196,6 @@ define i64 @load_fold_or3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: orq (%rdi), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_or3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rsi), %rax -; CHECK-MFENCE-NEXT: orq (%rdi), %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = or i64 %v, %v2 @@ -1617,12 +1237,6 @@ define i64 @load_fold_xor3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: xorq (%rdi), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_xor3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rsi), %rax -; CHECK-MFENCE-NEXT: xorq (%rdi), %rax -; 
CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = xor i64 %v, %v2 @@ -1642,12 +1256,6 @@ define i1 @load_fold_icmp1(ptr %p) { ; CHECK-O3-NEXT: cmpq $15, (%rdi) ; CHECK-O3-NEXT: sete %al ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_icmp1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: cmpq $15, (%rdi) -; CHECK-MFENCE-NEXT: sete %al -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = icmp eq i64 %v, 15 ret i1 %ret @@ -1666,12 +1274,6 @@ define i1 @load_fold_icmp2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: cmpq %rsi, (%rdi) ; CHECK-O3-NEXT: sete %al ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_icmp2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: cmpq %rsi, (%rdi) -; CHECK-MFENCE-NEXT: sete %al -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = icmp eq i64 %v, %v2 ret i1 %ret @@ -1692,13 +1294,6 @@ define i1 @load_fold_icmp3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: cmpq %rax, (%rdi) ; CHECK-O3-NEXT: sete %al ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_fold_icmp3: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rsi), %rax -; CHECK-MFENCE-NEXT: cmpq %rax, (%rdi) -; CHECK-MFENCE-NEXT: sete %al -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = icmp eq i64 %v, %v2 @@ -1724,11 +1319,6 @@ define void @rmw_fold_add1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: addq $15, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_add1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: addq $15, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = add i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1748,11 +1338,6 @@ define void @rmw_fold_add2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: addq %rsi, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_add2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: addq %rsi, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = add i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1772,11 +1357,6 @@ define void @rmw_fold_sub1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: addq $-15, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_sub1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: addq $-15, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sub i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1796,11 +1376,6 @@ define void @rmw_fold_sub2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: subq %rsi, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_sub2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: subq %rsi, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sub i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1836,12 +1411,6 @@ define void @rmw_fold_mul2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: imulq (%rdi), %rsi ; CHECK-O3-NEXT: movq %rsi, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_mul2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: imulq (%rdi), %rsi -; CHECK-MFENCE-NEXT: movq %rsi, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = mul i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1878,20 +1447,6 @@ define void 
@rmw_fold_sdiv1(ptr %p, i64 %v) { ; CHECK-O3-NEXT: addq %rax, %rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_sdiv1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rcx -; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 -; CHECK-MFENCE-NEXT: movq %rcx, %rax -; CHECK-MFENCE-NEXT: imulq %rdx -; CHECK-MFENCE-NEXT: addq %rcx, %rdx -; CHECK-MFENCE-NEXT: movq %rdx, %rax -; CHECK-MFENCE-NEXT: shrq $63, %rax -; CHECK-MFENCE-NEXT: sarq $3, %rdx -; CHECK-MFENCE-NEXT: addq %rax, %rdx -; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sdiv i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1927,26 +1482,6 @@ define void @rmw_fold_sdiv2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_sdiv2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq %rax, %rcx -; CHECK-MFENCE-NEXT: orq %rsi, %rcx -; CHECK-MFENCE-NEXT: shrq $32, %rcx -; CHECK-MFENCE-NEXT: je .LBB74_1 -; CHECK-MFENCE-NEXT: # %bb.2: -; CHECK-MFENCE-NEXT: cqto -; CHECK-MFENCE-NEXT: idivq %rsi -; CHECK-MFENCE-NEXT: movq %rax, (%rdi) -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB74_1: -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divl %esi -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax -; CHECK-MFENCE-NEXT: movq %rax, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sdiv i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1999,26 +1534,6 @@ define void @rmw_fold_udiv2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_udiv2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq %rax, %rcx -; CHECK-MFENCE-NEXT: orq %rsi, %rcx -; CHECK-MFENCE-NEXT: shrq $32, %rcx -; CHECK-MFENCE-NEXT: je .LBB76_1 -; CHECK-MFENCE-NEXT: # %bb.2: -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divq %rsi -; CHECK-MFENCE-NEXT: movq %rax, (%rdi) -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB76_1: -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divl %esi -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax -; CHECK-MFENCE-NEXT: movq %rax, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = udiv i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -2062,23 +1577,6 @@ define void @rmw_fold_srem1(ptr %p, i64 %v) { ; CHECK-O3-NEXT: subq %rax, %rcx ; CHECK-O3-NEXT: movq %rcx, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_srem1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rcx -; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 -; CHECK-MFENCE-NEXT: movq %rcx, %rax -; CHECK-MFENCE-NEXT: imulq %rdx -; CHECK-MFENCE-NEXT: addq %rcx, %rdx -; CHECK-MFENCE-NEXT: movq %rdx, %rax -; CHECK-MFENCE-NEXT: shrq $63, %rax -; CHECK-MFENCE-NEXT: sarq $3, %rdx -; CHECK-MFENCE-NEXT: addq %rax, %rdx -; CHECK-MFENCE-NEXT: leaq (%rdx,%rdx,4), %rax -; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax -; CHECK-MFENCE-NEXT: subq %rax, %rcx -; 
CHECK-MFENCE-NEXT: movq %rcx, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = srem i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -2114,26 +1612,6 @@ define void @rmw_fold_srem2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_srem2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq %rax, %rcx -; CHECK-MFENCE-NEXT: orq %rsi, %rcx -; CHECK-MFENCE-NEXT: shrq $32, %rcx -; CHECK-MFENCE-NEXT: je .LBB78_1 -; CHECK-MFENCE-NEXT: # %bb.2: -; CHECK-MFENCE-NEXT: cqto -; CHECK-MFENCE-NEXT: idivq %rsi -; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB78_1: -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divl %esi -; CHECK-MFENCE-NEXT: # kill: def $edx killed $edx def $rdx -; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = srem i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -2166,18 +1644,6 @@ define void @rmw_fold_urem1(ptr %p, i64 %v) { ; CHECK-O3-NEXT: subq %rax, %rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_urem1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rdx -; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889 -; CHECK-MFENCE-NEXT: mulxq %rax, %rax, %rax -; CHECK-MFENCE-NEXT: shrq $3, %rax -; CHECK-MFENCE-NEXT: leaq (%rax,%rax,4), %rax -; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax -; CHECK-MFENCE-NEXT: subq %rax, %rdx -; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = urem i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -2214,26 +1680,6 @@ define void @rmw_fold_urem2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_urem2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq %rax, %rcx -; CHECK-MFENCE-NEXT: orq %rsi, %rcx -; CHECK-MFENCE-NEXT: shrq $32, %rcx -; CHECK-MFENCE-NEXT: je .LBB80_1 -; CHECK-MFENCE-NEXT: # %bb.2: -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divq %rsi -; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB80_1: -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: xorl %edx, %edx -; CHECK-MFENCE-NEXT: divl %esi -; CHECK-MFENCE-NEXT: # kill: def $edx killed $edx def $rdx -; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = urem i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -2271,12 +1717,6 @@ define void @rmw_fold_shl2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: shlxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_shl2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: shlxq %rsi, (%rdi), %rax -; CHECK-MFENCE-NEXT: movq %rax, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = shl i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -2314,12 +1754,6 @@ define void @rmw_fold_lshr2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: shrxq %rsi, 
(%rdi), %rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_lshr2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: shrxq %rsi, (%rdi), %rax -; CHECK-MFENCE-NEXT: movq %rax, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = lshr i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -2357,12 +1791,6 @@ define void @rmw_fold_ashr2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: sarxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_ashr2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: sarxq %rsi, (%rdi), %rax -; CHECK-MFENCE-NEXT: movq %rax, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = ashr i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -2384,11 +1812,6 @@ define void @rmw_fold_and1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: andq $15, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_and1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: andq $15, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = and i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -2408,11 +1831,6 @@ define void @rmw_fold_and2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: andq %rsi, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_and2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: andq %rsi, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = and i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -2432,11 +1850,6 @@ define void @rmw_fold_or1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: orq $15, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_or1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: orq $15, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = or i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -2456,11 +1869,6 @@ define void @rmw_fold_or2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: orq %rsi, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_or2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: orq %rsi, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = or i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -2480,11 +1888,6 @@ define void @rmw_fold_xor1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: xorq $15, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_xor1: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: xorq $15, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = xor i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -2504,11 +1907,6 @@ define void @rmw_fold_xor2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: xorq %rsi, (%rdi) ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: rmw_fold_xor2: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: xorq %rsi, (%rdi) -; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = xor i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -2545,13 +1943,6 @@ define i32 @fold_trunc_add(ptr %p, i32 %v2) { ; CHECK-O3-NEXT: addl %esi, %eax ; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: fold_trunc_add: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), 
%rax -; CHECK-MFENCE-NEXT: addl %esi, %eax -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %trunc = trunc i64 %v to i32 %ret = add i32 %trunc, %v2 @@ -2573,13 +1964,6 @@ define i32 @fold_trunc_and(ptr %p, i32 %v2) { ; CHECK-O3-NEXT: andl %esi, %eax ; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: fold_trunc_and: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: andl %esi, %eax -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %trunc = trunc i64 %v to i32 %ret = and i32 %trunc, %v2 @@ -2601,13 +1985,6 @@ define i32 @fold_trunc_or(ptr %p, i32 %v2) { ; CHECK-O3-NEXT: orl %esi, %eax ; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: fold_trunc_or: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: orl %esi, %eax -; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %trunc = trunc i64 %v to i32 %ret = or i32 %trunc, %v2 @@ -2635,15 +2012,6 @@ define i32 @split_load(ptr %p) { ; CHECK-O3-NEXT: orl %eax, %ecx ; CHECK-O3-NEXT: movzbl %cl, %eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: split_load: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: movq %rax, %rcx -; CHECK-MFENCE-NEXT: shrq $32, %rcx -; CHECK-MFENCE-NEXT: orl %eax, %ecx -; CHECK-MFENCE-NEXT: movzbl %cl, %eax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %b1 = trunc i64 %v to i8 %v.shift = lshr i64 %v, 32 @@ -2725,26 +2093,12 @@ define void @dead_store(ptr %p, i64 %v) { ;; isn't violated. 
define i64 @nofold_fence(ptr %p) { -; CHECK-O0-LABEL: nofold_fence: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: movq (%rdi), %rax -; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) -; CHECK-O0-NEXT: addq $15, %rax -; CHECK-O0-NEXT: retq -; -; CHECK-O3-LABEL: nofold_fence: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: movq (%rdi), %rax -; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) -; CHECK-O3-NEXT: addq $15, %rax -; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: nofold_fence: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: mfence -; CHECK-MFENCE-NEXT: addq $15, %rax -; CHECK-MFENCE-NEXT: retq +; CHECK-LABEL: nofold_fence: +; CHECK: # %bb.0: +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: addq $15, %rax +; CHECK-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 fence seq_cst %ret = add i64 %v, 15 @@ -2794,12 +2148,6 @@ define i64 @fold_constant(i64 %arg) { ; CHECK-O3-NEXT: movq %rdi, %rax ; CHECK-O3-NEXT: addq Constant(%rip), %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: fold_constant: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq %rdi, %rax -; CHECK-MFENCE-NEXT: addq Constant(%rip), %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr @Constant unordered, align 8 %ret = add i64 %v, %arg ret i64 %ret @@ -2819,26 +2167,12 @@ define i64 @fold_constant_clobber(ptr %p, i64 %arg) { } define i64 @fold_constant_fence(i64 %arg) { -; CHECK-O0-LABEL: fold_constant_fence: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: movq Constant(%rip), %rax -; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) -; CHECK-O0-NEXT: addq %rdi, %rax -; CHECK-O0-NEXT: retq -; -; CHECK-O3-LABEL: fold_constant_fence: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: movq Constant(%rip), %rax -; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) -; CHECK-O3-NEXT: addq %rdi, %rax -; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: fold_constant_fence: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq Constant(%rip), %rax -; CHECK-MFENCE-NEXT: mfence -; CHECK-MFENCE-NEXT: addq %rdi, %rax -; CHECK-MFENCE-NEXT: retq +; CHECK-LABEL: fold_constant_fence: +; CHECK: # %bb.0: +; CHECK-NEXT: movq Constant(%rip), %rax +; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: addq %rdi, %rax +; CHECK-NEXT: retq %v = load atomic i64, ptr @Constant unordered, align 8 fence seq_cst %ret = add i64 %v, %arg @@ -2860,26 +2194,12 @@ define i64 @fold_invariant_clobber(ptr dereferenceable(8) %p, i64 %arg) { define i64 @fold_invariant_fence(ptr dereferenceable(8) %p, i64 %arg) { -; CHECK-O0-LABEL: fold_invariant_fence: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: movq (%rdi), %rax -; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) -; CHECK-O0-NEXT: addq %rsi, %rax -; CHECK-O0-NEXT: retq -; -; CHECK-O3-LABEL: fold_invariant_fence: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: movq (%rdi), %rax -; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) -; CHECK-O3-NEXT: addq %rsi, %rax -; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: fold_invariant_fence: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movq (%rdi), %rax -; CHECK-MFENCE-NEXT: mfence -; CHECK-MFENCE-NEXT: addq %rsi, %rax -; CHECK-MFENCE-NEXT: retq +; CHECK-LABEL: fold_invariant_fence: +; CHECK: # %bb.0: +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: addq %rsi, %rax +; CHECK-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8, !invariant.load !{} fence seq_cst %ret = add i64 %v, %arg @@ -2902,12 +2222,6 @@ define i16 @load_i8_anyext_i16(ptr %ptr) { ; CHECK-O3-NEXT: movzbl 
(%rdi), %eax ; CHECK-O3-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_i8_anyext_i16: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax -; CHECK-MFENCE-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-MFENCE-NEXT: retq %v = load atomic i8, ptr %ptr unordered, align 2 %vec = insertelement <2 x i8> undef, i8 %v, i32 0 %res = bitcast <2 x i8> %vec to i16 @@ -2925,11 +2239,6 @@ define i32 @load_i8_anyext_i32(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzbl (%rdi), %eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_i8_anyext_i32: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax -; CHECK-MFENCE-NEXT: retq %v = load atomic i8, ptr %ptr unordered, align 4 %vec = insertelement <4 x i8> undef, i8 %v, i32 0 %res = bitcast <4 x i8> %vec to i32 @@ -2948,11 +2257,6 @@ define i32 @load_i16_anyext_i32(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzwl (%rdi), %eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_i16_anyext_i32: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax -; CHECK-MFENCE-NEXT: retq %v = load atomic i16, ptr %ptr unordered, align 4 %vec = insertelement <2 x i16> undef, i16 %v, i64 0 %res = bitcast <2 x i16> %vec to i32 @@ -2975,13 +2279,6 @@ define i64 @load_i16_anyext_i64(ptr %ptr) { ; CHECK-O3-NEXT: vmovd %eax, %xmm0 ; CHECK-O3-NEXT: vmovq %xmm0, %rax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_i16_anyext_i64: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax -; CHECK-MFENCE-NEXT: vmovd %eax, %xmm0 -; CHECK-MFENCE-NEXT: vmovq %xmm0, %rax -; CHECK-MFENCE-NEXT: retq %v = load atomic i16, ptr %ptr unordered, align 8 %vec = insertelement <4 x i16> undef, i16 %v, i64 0 %res = bitcast <4 x i16> %vec to i64 @@ -3010,15 +2307,6 @@ define i16 @load_combine(ptr %p) { ; CHECK-O3-NEXT: orl %ecx, %eax ; CHECK-O3-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: load_combine: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movzbl (%rdi), %ecx -; CHECK-MFENCE-NEXT: movzbl 1(%rdi), %eax -; CHECK-MFENCE-NEXT: shll $8, %eax -; CHECK-MFENCE-NEXT: orl %ecx, %eax -; CHECK-MFENCE-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-MFENCE-NEXT: retq %v1 = load atomic i8, ptr %p unordered, align 2 %p2 = getelementptr i8, ptr %p, i64 1 %v2 = load atomic i8, ptr %p2 unordered, align 1 @@ -3056,19 +2344,6 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) { ; CHECK-O3-NEXT: .LBB116_2: # %untaken ; CHECK-O3-NEXT: xorl %eax, %eax ; CHECK-O3-NEXT: retq -; -; CHECK-MFENCE-LABEL: fold_cmp_over_fence: -; CHECK-MFENCE: # %bb.0: -; CHECK-MFENCE-NEXT: movl (%rdi), %eax -; CHECK-MFENCE-NEXT: mfence -; CHECK-MFENCE-NEXT: cmpl %eax, %esi -; CHECK-MFENCE-NEXT: jne .LBB116_2 -; CHECK-MFENCE-NEXT: # %bb.1: # %taken -; CHECK-MFENCE-NEXT: movb $1, %al -; CHECK-MFENCE-NEXT: retq -; CHECK-MFENCE-NEXT: .LBB116_2: # %untaken -; CHECK-MFENCE-NEXT: xorl %eax, %eax -; CHECK-MFENCE-NEXT: retq %v2 = load atomic i32, ptr %p unordered, align 4 fence seq_cst %cmp = icmp eq i32 %v1, %v2 diff --git a/llvm/test/CodeGen/X86/implicit-null-check.ll b/llvm/test/CodeGen/X86/implicit-null-check.ll index fc81f703f5d40..de63c9ae209df 100644 --- a/llvm/test/CodeGen/X86/implicit-null-check.ll +++ b/llvm/test/CodeGen/X86/implicit-null-check.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -verify-machineinstrs -O3 -mtriple=x86_64-apple-macosx 
-enable-implicit-null-checks < %s | FileCheck %s define i32 @imp_null_check_load(ptr %x) { @@ -465,7 +466,7 @@ define i32 @imp_null_check_load_fence2(ptr %x) { ; CHECK-NEXT: testq %rdi, %rdi ; CHECK-NEXT: je LBB17_1 ; CHECK-NEXT: ## %bb.2: ## %not_null -; CHECK-NEXT: mfence +; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl (%rdi), %eax ; CHECK-NEXT: retq ; CHECK-NEXT: LBB17_1: ## %is_null diff --git a/llvm/test/CodeGen/X86/membarrier.ll b/llvm/test/CodeGen/X86/membarrier.ll index 55f2a2f210139..2773f01f7ab82 100644 --- a/llvm/test/CodeGen/X86/membarrier.ll +++ b/llvm/test/CodeGen/X86/membarrier.ll @@ -6,9 +6,9 @@ define i32 @t() { ; CHECK-LABEL: t: ; CHECK: # %bb.0: ; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: mfence +; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: lock decl -{{[0-9]+}}(%rsp) -; CHECK-NEXT: mfence +; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: retq %i = alloca i32, align 4 diff --git a/llvm/test/CodeGen/X86/mfence.ll b/llvm/test/CodeGen/X86/mfence.ll index 1be1e4793461b..ce74d2dd69f9b 100644 --- a/llvm/test/CodeGen/X86/mfence.ll +++ b/llvm/test/CodeGen/X86/mfence.ll @@ -1,15 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s --check-prefixes=CHECK,X64 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2,+avoid-mfence | FileCheck %s --check-prefix=X64-NO-MFENCE ; It doesn't matter if an x86-64 target has specified "no-sse2"; we still can use mfence. define void @test() { -; CHECK-LABEL: test: -; CHECK: # %bb.0: -; CHECK-NEXT: mfence -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: test: +; X86: # %bb.0: +; X86-NEXT: mfence +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # %bb.0: +; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; X64-NEXT: retq fence seq_cst ret void } @@ -24,15 +28,9 @@ define i32 @fence(ptr %ptr) { ; ; X64-LABEL: fence: ; X64: # %bb.0: -; X64-NEXT: mfence +; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: retq -; -; X64-NO-MFENCE-LABEL: fence: -; X64-NO-MFENCE: # %bb.0: -; X64-NO-MFENCE-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) -; X64-NO-MFENCE-NEXT: movl (%rdi), %eax -; X64-NO-MFENCE-NEXT: retq %atomic = atomicrmw add ptr %ptr, i32 0 seq_cst ret i32 %atomic } @@ -43,15 +41,10 @@ define void @mfence() nounwind { ; X32-NEXT: mfence ; X32-NEXT: retl ; -; X64-LABEL: mfence: -; X64: # %bb.0: -; X64-NEXT: mfence -; X64-NEXT: retq -; -; X64-NO-MFENCE-LABEL: mfence: -; X64-NO-MFENCE: # %bb.0: -; X64-NO-MFENCE-NEXT: mfence -; X64-NO-MFENCE-NEXT: retq +; CHECK-LABEL: mfence: +; CHECK: # %bb.0: +; CHECK-NEXT: mfence +; CHECK-NEXT: ret{{[l|q]}} call void @llvm.x86.sse2.mfence() ret void } From aef29a300549c031cba802c2469fceb4190319bb Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 11 Mar 2025 11:55:47 +0100 Subject: [PATCH 3/3] fixup! 
don't use tuning for avoid mfence --- llvm/lib/Target/X86/X86.td | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index aa59d9a70a212..38761e1fd7eec 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -772,7 +772,6 @@ def TuningUseGLMDivSqrtCosts def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true", "Target has branch hint feature">; - //===----------------------------------------------------------------------===// // X86 CPU Families // TODO: Remove these - use general tuning features to determine codegen. @@ -1269,8 +1268,7 @@ def ProcessorFeatures { // Tremont list TRMAdditionalFeatures = [FeatureCLWB, FeatureGFNI]; - list TRMAdditionalTuning = []; - list TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning); + list TRMTuning = GLPTuning; list TRMFeatures = !listconcat(GLPFeatures, TRMAdditionalFeatures); @@ -1742,7 +1740,7 @@ def : ProcModel; } foreach P = ["penryn", "core_2_duo_sse4_1"] in {
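
For reference, a minimal end-to-end sketch of the behavior this series settles on (illustrative only, not part of the diffs above; the file name and function names here are made up). With `avoidMFence()` returning true for any 64-bit x86 subtarget, a standalone `fence seq_cst` now lowers to an idempotent `lock or` of a dead stack slot, while the SSE2 intrinsic remains available for code that wants that exact instruction:

; fence-sketch.ll -- try with: llc -mtriple=x86_64-unknown-unknown fence-sketch.ll
define void @seq_cst_fence() {
  fence seq_cst                       ; now emits: lock orl $0, -8(%rsp)  (offset may vary; was: mfence)
  ret void
}

define void @explicit_mfence() {
  call void @llvm.x86.sse2.mfence()   ; still emits: mfence
  ret void
}

declare void @llvm.x86.sse2.mfence()

On 32-bit targets `avoidMFence()` returns false, so `fence seq_cst` keeps lowering to `mfence` (when SSE2 is available) exactly as before.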