diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 7aa1fadd10dfc..e01fbcbcf7761 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1164,6 +1164,43 @@ SDValue SelectionDAGBuilder::getMemoryRoot() { return updateRoot(PendingLoads); } +SDValue SelectionDAGBuilder::getFPOperationRoot(fp::ExceptionBehavior EB) { + // If the new exception behavior differs from that of the pending + // ones, chain up them and update the root. + switch (EB) { + case fp::ExceptionBehavior::ebMayTrap: + case fp::ExceptionBehavior::ebIgnore: + // Floating-point exceptions produced by such operations are not intended + // to be observed, so the sequence of these operations does not need to be + // preserved. + // + // They however must not be mixed with the instructions that have strict + // exception behavior. Placing an operation with 'ebIgnore' behavior between + // 'ebStrict' operations could distort the observed exception behavior. + if (!PendingConstrainedFPStrict.empty()) { + assert(PendingConstrainedFP.empty()); + updateRoot(PendingConstrainedFPStrict); + } + break; + case fp::ExceptionBehavior::ebStrict: + // Floating-point exception produced by these operations may be observed, so + // they must be correctly chained. If trapping on FP exceptions is + // disabled, the exceptions can be observed only by functions that read + // exception flags, like 'llvm.get_fpenv' or 'fetestexcept'. It means that + // the order of operations is not significant between barriers. + // + // If trapping is enabled, each operation becomes an implicit observation + // point, so the operations must be sequenced according their original + // source order. + if (!PendingConstrainedFP.empty()) { + assert(PendingConstrainedFPStrict.empty()); + updateRoot(PendingConstrainedFP); + } + // TODO: Add support for trapping-enabled scenarios. + } + return DAG.getRoot(); +} + SDValue SelectionDAGBuilder::getRoot() { // Chain up all pending constrained intrinsics together with all // pending loads, by simply appending them to PendingLoads and @@ -8280,6 +8317,30 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } } +void SelectionDAGBuilder::pushFPOpOutChain(SDValue Result, + fp::ExceptionBehavior EB) { + assert(Result.getNode()->getNumValues() == 2); + SDValue OutChain = Result.getValue(1); + assert(OutChain.getValueType() == MVT::Other); + + // Instead of updating the root immediately, push the produced chain to the + // appropriate list, deferring the update until the root is requested. In this + // case, the nodes from the lists are chained using TokenFactor, indicating + // that the operations are independent. + // + // In particular, the root is updated before any call that might access the + // floating-point environment, except for constrained intrinsics. + switch (EB) { + case fp::ExceptionBehavior::ebMayTrap: + case fp::ExceptionBehavior::ebIgnore: + PendingConstrainedFP.push_back(OutChain); + break; + case fp::ExceptionBehavior::ebStrict: + PendingConstrainedFPStrict.push_back(OutChain); + break; + } +} + void SelectionDAGBuilder::visitConstrainedFPIntrinsic( const ConstrainedFPIntrinsic &FPI) { SDLoc sdl = getCurSDLoc(); @@ -8287,42 +8348,16 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( // We do not need to serialize constrained FP intrinsics against // each other or against (nonvolatile) loads, so they can be // chained like loads. - SDValue Chain = DAG.getRoot(); + fp::ExceptionBehavior EB = *FPI.getExceptionBehavior(); + SDValue Chain = getFPOperationRoot(EB); SmallVector Opers; Opers.push_back(Chain); for (unsigned I = 0, E = FPI.getNonMetadataArgCount(); I != E; ++I) Opers.push_back(getValue(FPI.getArgOperand(I))); - auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) { - assert(Result.getNode()->getNumValues() == 2); - - // Push node to the appropriate list so that future instructions can be - // chained up correctly. - SDValue OutChain = Result.getValue(1); - switch (EB) { - case fp::ExceptionBehavior::ebIgnore: - // The only reason why ebIgnore nodes still need to be chained is that - // they might depend on the current rounding mode, and therefore must - // not be moved across instruction that may change that mode. - [[fallthrough]]; - case fp::ExceptionBehavior::ebMayTrap: - // These must not be moved across calls or instructions that may change - // floating-point exception masks. - PendingConstrainedFP.push_back(OutChain); - break; - case fp::ExceptionBehavior::ebStrict: - // These must not be moved across calls or instructions that may change - // floating-point exception masks or read floating-point exception flags. - // In addition, they cannot be optimized out even if unused. - PendingConstrainedFPStrict.push_back(OutChain); - break; - } - }; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = TLI.getValueType(DAG.getDataLayout(), FPI.getType()); SDVTList VTs = DAG.getVTList(VT, MVT::Other); - fp::ExceptionBehavior EB = *FPI.getExceptionBehavior(); SDNodeFlags Flags; if (EB == fp::ExceptionBehavior::ebIgnore) @@ -8346,7 +8381,7 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) { Opers.pop_back(); SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers, Flags); - pushOutChain(Mul, EB); + pushFPOpOutChain(Mul, EB); Opcode = ISD::STRICT_FADD; Opers.clear(); Opers.push_back(Mul.getValue(1)); @@ -8377,7 +8412,7 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( } SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers, Flags); - pushOutChain(Result, EB); + pushFPOpOutChain(Result, EB); SDValue FPResult = Result.getValue(0); setValue(&FPI, FPResult); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index e0835e6310357..9b2afb8e78651 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -195,6 +195,11 @@ class SelectionDAGBuilder { /// Update root to include all chains from the Pending list. SDValue updateRoot(SmallVectorImpl &Pending); + /// Given a node representing a floating-point operation and its specified + /// exception behavior, this either updates the root or stores the node in + /// a list to be added to chains latter. + void pushFPOpOutChain(SDValue Result, fp::ExceptionBehavior EB); + /// A unique monotonically increasing number used to order the SDNodes we /// create. unsigned SDNodeOrder; @@ -300,6 +305,13 @@ class SelectionDAGBuilder { /// memory node that may need to be ordered after any prior load instructions. SDValue getMemoryRoot(); + /// Return the current virtual root of the Selection DAG, flushing + /// PendingConstrainedFP or PendingConstrainedFPStrict items if the new + /// exception behavior (specified by \p EB) differs from that of the pending + /// instructions. This must be done before emitting constrained FP operation + /// call. + SDValue getFPOperationRoot(fp::ExceptionBehavior EB); + /// Similar to getMemoryRoot, but also flushes PendingConstrainedFP(Strict) /// items. This must be done before emitting any call other any other node /// that may need to be ordered after FP instructions due to other side diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll index bd32430fcedbd..fc5279d02ab8a 100644 --- a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll @@ -100,40 +100,52 @@ entry: ret i32 %result } -; These 2 divs only differ in their exception behavior and will be CSEd. Make -; sure the nofpexcept flag is not set on the combined node. +; These 4 divs only differ in their exception behavior. They form two groups, +; whithin each the constrained functions have the same exception hehavior and +; may be CSE'd. Instructions with different exception behavior belong to +; different groups, they have different chain argument and cannot be CSE'd. define void @binop_cse(double %a, double %b, ptr %x, ptr %y) #0 { entry: ; CHECK-LABEL: name: binop_cse -; CHECK: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0) -; CHECK: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16) -; CHECK: [[MOVSDrm_alt:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16) -; CHECK: %3:fr64 = DIVSDrm [[MOVSDrm_alt]], %fixed-stack.2, 1, $noreg, 0, $noreg, implicit $mxcsr :: (load (s64) from %fixed-stack.2) -; CHECK: MOVSDmr killed [[MOV32rm1]], 1, $noreg, 0, $noreg, %3 :: (store (s64) into %ir.x, align 4) -; CHECK: MOVSDmr killed [[MOV32rm]], 1, $noreg, 0, $noreg, %3 :: (store (s64) into %ir.y, align 4) +; CHECK: [[Y:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0) +; CHECK: [[X:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16) +; CHECK: [[B:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2) +; CHECK: [[A:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16) +; CHECK: [[DIV0:%[0-9]+]]:fr64 = DIVSDrr [[A]], [[B]], implicit $mxcsr +; CHECK: [[DIV1:%[0-9]+]]:fr64 = nofpexcept DIVSDrr [[A]], [[B]], implicit $mxcsr +; CHECK: MOVSDmr killed [[X]], 1, $noreg, 0, $noreg, [[DIV1]] :: (store (s64) into %ir.x, align 4) +; CHECK: MOVSDmr killed [[Y]], 1, $noreg, 0, $noreg, [[DIV1]] :: (store (s64) into %ir.y, align 4) ; CHECK: RET 0 %div = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + %div1 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 %div2 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 - store double %div, ptr %x - store double %div2, ptr %y + %div3 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + store double %div2, ptr %x + store double %div3, ptr %y ret void } -; These 2 sitofps only differ in their exception behavior and will be CSEd. Make -; sure the nofpexcept flag is not set on the combined node. +; These 4 divs only differ in their exception behavior. They form two groups, +; whithin each the constrained functions have the same exception hehavior and +; may be CSE'd. Instructions with different exception behavior belong to +; different groups, they have different chain argument and cannot be CSE'd. define void @sitofp_cse(i32 %a, ptr %x, ptr %y) #0 { entry: ; CHECK-LABEL: name: sitofp_cse -; CHECK: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 8) -; CHECK: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1) -; CHECK: %2:fr64 = CVTSI2SDrm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 16) -; CHECK: MOVSDmr killed [[MOV32rm1]], 1, $noreg, 0, $noreg, %2 :: (store (s64) into %ir.x, align 4) -; CHECK: MOVSDmr killed [[MOV32rm]], 1, $noreg, 0, $noreg, %2 :: (store (s64) into %ir.y, align 4) +; CHECK: [[Y:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 8) +; CHECK: [[X:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1) +; CHECK: [[A:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 16) +; CHECK: [[CVT0:%[0-9]+]]:fr64 = CVTSI2SDrr [[A]] +; CHECK: [[CVT1:%[0-9]+]]:fr64 = nofpexcept CVTSI2SDrr [[A]] +; CHECK: MOVSDmr killed [[X]], 1, $noreg, 0, $noreg, [[CVT1]] :: (store (s64) into %ir.x, align 4) +; CHECK: MOVSDmr killed [[Y]], 1, $noreg, 0, $noreg, [[CVT1]] :: (store (s64) into %ir.y, align 4) ; CHECK: RET 0 %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + %result1 = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 %result2 = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 - store double %result, ptr %x - store double %result2, ptr %y + %result3 = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + store double %result2, ptr %x + store double %result3, ptr %y ret void }