From bf554507f71a0661a56126dd784659d62603fd65 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 28 Oct 2024 14:20:47 +0000 Subject: [PATCH 1/3] [AArch64] Avoid streaming mode hazards from FP constant stores Currently, several places will replace stores of floating-point constants with equivalent integer stores (and constant generation). This is generally beneficial. However, in streaming functions on some SME devices, this could result in a streaming mode memory hazard. This patch adds a TLI hook `.canUseIntLoadStoreForFloatValues()` and implements it for AArch64 to prevent these transformations in streaming functions if a hazard is possible. --- llvm/include/llvm/CodeGen/TargetLowering.h | 4 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 + llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 75 ++++---- .../Target/AArch64/AArch64ISelLowering.cpp | 19 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 + .../sve-streaming-mode-fp-constant-stores.ll | 171 ++++++++++++++++++ 6 files changed, 237 insertions(+), 37 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-streaming-mode-fp-constant-stores.ll diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 8e0cdc6f1a5e7..c8f4436111ba7 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -489,6 +489,10 @@ class TargetLoweringBase { return true; } + /// Returns true if a floating-point load or store can be replaced with an + /// equivalent integer load or store without negatively affecting performance. + virtual bool canUseIntLoadStoreForFloatValues() const { return true; } + /// Return true if it is profitable to convert a select of FP constants into /// a constant pool load whose address depends on the select condition. The /// parameter may be used to differentiate a select with FP compare from diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b800204d91750..ec3a7bc5eae0e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -21587,6 +21587,9 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { // processor operation but an i64 (which is not legal) requires two. So the // transform should not be done in this case. + if (!TLI.canUseIntLoadStoreForFloatValues()) + return SDValue(); + SDValue Tmp; switch (CFP->getSimpleValueType(0).SimpleTy) { default: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 6ba12cfb8c514..60aa5f1bd2fe0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -430,45 +430,50 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { if (Value.getOpcode() == ISD::TargetConstantFP) return SDValue(); - if (ConstantFPSDNode *CFP = dyn_cast(Value)) { - if (CFP->getValueType(0) == MVT::f32 && - TLI.isTypeLegal(MVT::i32)) { - SDValue Con = DAG.getConstant(CFP->getValueAPF(). - bitcastToAPInt().zextOrTrunc(32), - SDLoc(CFP), MVT::i32); - return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), - ST->getOriginalAlign(), MMOFlags, AAInfo); - } + ConstantFPSDNode *CFP = dyn_cast(Value); + if (!CFP) + return SDValue(); - if (CFP->getValueType(0) == MVT::f64 && - !TLI.isFPImmLegal(CFP->getValueAPF(), MVT::f64)) { - // If this target supports 64-bit registers, do a single 64-bit store. - if (TLI.isTypeLegal(MVT::i64)) { - SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). - zextOrTrunc(64), SDLoc(CFP), MVT::i64); - return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), - ST->getOriginalAlign(), MMOFlags, AAInfo); - } + if (!TLI.canUseIntLoadStoreForFloatValues()) + return SDValue(); - if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) { - // Otherwise, if the target supports 32-bit registers, use 2 32-bit - // stores. If the target supports neither 32- nor 64-bits, this - // xform is certainly not worth it. - const APInt &IntVal = CFP->getValueAPF().bitcastToAPInt(); - SDValue Lo = DAG.getConstant(IntVal.trunc(32), dl, MVT::i32); - SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), dl, MVT::i32); - if (DAG.getDataLayout().isBigEndian()) - std::swap(Lo, Hi); - - Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), - ST->getOriginalAlign(), MMOFlags, AAInfo); - Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(4), dl); - Hi = DAG.getStore(Chain, dl, Hi, Ptr, - ST->getPointerInfo().getWithOffset(4), + EVT FloatTy = CFP->getValueType(0); + if (FloatTy == MVT::f32 && TLI.isTypeLegal(MVT::i32)) { + SDValue Con = + DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(32), + SDLoc(CFP), MVT::i32); + return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), + ST->getOriginalAlign(), MMOFlags, AAInfo); + } + + if (FloatTy == MVT::f64 && !TLI.isFPImmLegal(CFP->getValueAPF(), MVT::f64)) { + // If this target supports 64-bit registers, do a single 64-bit store. + if (TLI.isTypeLegal(MVT::i64)) { + SDValue Con = + DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(64), + SDLoc(CFP), MVT::i64); + return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), ST->getOriginalAlign(), MMOFlags, AAInfo); + } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); - } + if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) { + // Otherwise, if the target supports 32-bit registers, use 2 32-bit + // stores. If the target supports neither 32- nor 64-bits, this xform is + // certainly not worth it. + const APInt &IntVal = CFP->getValueAPF().bitcastToAPInt(); + SDValue Lo = DAG.getConstant(IntVal.trunc(32), dl, MVT::i32); + SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), dl, MVT::i32); + if (DAG.getDataLayout().isBigEndian()) + std::swap(Lo, Hi); + + Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), + ST->getOriginalAlign(), MMOFlags, AAInfo); + Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(4), dl); + Hi = DAG.getStore(Chain, dl, Hi, Ptr, + ST->getPointerInfo().getWithOffset(4), + ST->getOriginalAlign(), MMOFlags, AAInfo); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } } return SDValue(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 31a720ed7b5c7..bdabfafd43976 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22514,7 +22514,8 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { /// movi v0.2d, #0 /// str q0, [x0] /// -static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { +static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St, + AArch64Subtarget const &Subtarget) { SDValue StVal = St.getValue(); EVT VT = StVal.getValueType(); @@ -22522,6 +22523,13 @@ static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { if (VT.isScalableVector()) return SDValue(); + // Do not replace the FP store when it could result in a streaming memory + // hazard. + if (VT.getVectorElementType().isFloatingPoint() && + Subtarget.getStreamingHazardSize() > 0 && + (Subtarget.isStreaming() || Subtarget.isStreamingCompatible())) + return SDValue(); + // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or // 2, 3 or 4 i32 elements. int NumVecElts = VT.getVectorNumElements(); @@ -22651,7 +22659,7 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // If we get a splat of zeros, convert this vector store to a store of // scalars. They will be merged into store pairs of xzr thereby removing one // instruction and one register. - if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S)) + if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S, *Subtarget)) return ReplacedZeroSplat; // FIXME: The logic for deciding if an unaligned store should be split should @@ -27706,6 +27714,13 @@ bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, return !NoFloat || MemVT.getSizeInBits() <= 64; } +bool AArch64TargetLowering::canUseIntLoadStoreForFloatValues() const { + // Avoid replacing FP loads/stores with integer ones when it could result in a + // streaming memory hazard. + return !Subtarget->getStreamingHazardSize() || + (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()); +} + bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { // We want inc-of-add for scalars and sub-of-not for vectors. return VT.isScalarInteger(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index d696355bb062a..b77bb29144713 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -840,6 +840,8 @@ class AArch64TargetLowering : public TargetLowering { bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override; + bool canUseIntLoadStoreForFloatValues() const override; + bool isCheapToSpeculateCttz(Type *) const override { return true; } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fp-constant-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fp-constant-stores.ll new file mode 100644 index 0000000000000..c060bc0a901ea --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fp-constant-stores.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -aarch64-streaming-hazard-size=64 -force-streaming-compatible -mattr=+sve < %s | FileCheck %s +; RUN: llc -aarch64-streaming-hazard-size=64 -force-streaming -mattr=+sme < %s | FileCheck %s +; RUN: llc -force-streaming -mattr=+sme < %s | FileCheck %s --check-prefix=NOHAZARD + +target triple = "aarch64-unknown-linux-gnu" + +; This test checks that in streaming[-compatible] functions if there could be +; a hazard between GPR and FPR memory operations, then integer stores are not +; used for floating-point constants. + +define void @"store_f64_0.0"(ptr %num) { +; CHECK-LABEL: store_f64_0.0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d0, xzr +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret +; +; NOHAZARD-LABEL: store_f64_0.0: +; NOHAZARD: // %bb.0: // %entry +; NOHAZARD-NEXT: str xzr, [x0] +; NOHAZARD-NEXT: ret +entry: + store double 0.000000e+00, ptr %num, align 8 + ret void +} + +define void @"store_f64_1.0"(ptr %num) { +; CHECK-LABEL: store_f64_1.0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d0, #1.00000000 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret +; +; NOHAZARD-LABEL: store_f64_1.0: +; NOHAZARD: // %bb.0: // %entry +; NOHAZARD-NEXT: mov x8, #4607182418800017408 // =0x3ff0000000000000 +; NOHAZARD-NEXT: str x8, [x0] +; NOHAZARD-NEXT: ret +entry: + store double 1.000000e+00, ptr %num, align 8 + ret void +} + +define void @"store_f64_1.23456789"(ptr %num) { +; CHECK-LABEL: store_f64_1.23456789: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret +; +; NOHAZARD-LABEL: store_f64_1.23456789: +; NOHAZARD: // %bb.0: // %entry +; NOHAZARD-NEXT: mov x8, #56859 // =0xde1b +; NOHAZARD-NEXT: movk x8, #17027, lsl #16 +; NOHAZARD-NEXT: movk x8, #49354, lsl #32 +; NOHAZARD-NEXT: movk x8, #16371, lsl #48 +; NOHAZARD-NEXT: str x8, [x0] +; NOHAZARD-NEXT: ret +entry: + store double 0x3FF3C0CA4283DE1B, ptr %num, align 8 + ret void +} + +define void @"store_f32_0.0"(ptr %num) { +; CHECK-LABEL: store_f32_0.0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s0, wzr +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret +; +; NOHAZARD-LABEL: store_f32_0.0: +; NOHAZARD: // %bb.0: // %entry +; NOHAZARD-NEXT: str wzr, [x0] +; NOHAZARD-NEXT: ret +entry: + store float 0.000000e+00, ptr %num, align 4 + ret void +} + +define void @"store_f32_1.0"(ptr %num) { +; CHECK-LABEL: store_f32_1.0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s0, #1.00000000 +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret +; +; NOHAZARD-LABEL: store_f32_1.0: +; NOHAZARD: // %bb.0: // %entry +; NOHAZARD-NEXT: mov w8, #1065353216 // =0x3f800000 +; NOHAZARD-NEXT: str w8, [x0] +; NOHAZARD-NEXT: ret +entry: + store float 1.000000e+00, ptr %num, align 4 + ret void +} + +define void @"store_f32_1.23456789"(ptr %num) { +; CHECK-LABEL: store_f32_1.23456789: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1618 // =0x652 +; CHECK-NEXT: movk w8, #16286, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret +; +; NOHAZARD-LABEL: store_f32_1.23456789: +; NOHAZARD: // %bb.0: // %entry +; NOHAZARD-NEXT: mov w8, #1618 // =0x652 +; NOHAZARD-NEXT: movk w8, #16286, lsl #16 +; NOHAZARD-NEXT: str w8, [x0] +; NOHAZARD-NEXT: ret +entry: + store float 0x3FF3C0CA40000000, ptr %num, align 4 + ret void +} + +define void @"store_v4f32_0.0"(ptr %num) { +; CHECK-LABEL: store_v4f32_0.0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret +; +; NOHAZARD-LABEL: store_v4f32_0.0: +; NOHAZARD: // %bb.0: // %entry +; NOHAZARD-NEXT: stp xzr, xzr, [x0] +; NOHAZARD-NEXT: ret +entry: + store <4 x float> zeroinitializer, ptr %num, align 16 + ret void +} + +define void @"store_v4f32_1.0"(ptr %num) { +; CHECK-LABEL: store_v4f32_1.0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov z0.s, #1.00000000 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret +; +; NOHAZARD-LABEL: store_v4f32_1.0: +; NOHAZARD: // %bb.0: // %entry +; NOHAZARD-NEXT: fmov z0.s, #1.00000000 +; NOHAZARD-NEXT: str q0, [x0] +; NOHAZARD-NEXT: ret +entry: + store <4 x float> , ptr %num, align 16 + ret void +} + +define void @"store_v4f32_1.23456789"(ptr %num) { +; CHECK-LABEL: store_v4f32_1.23456789: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1618 // =0x652 +; CHECK-NEXT: movk w8, #16286, lsl #16 +; CHECK-NEXT: mov z0.s, w8 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret +; +; NOHAZARD-LABEL: store_v4f32_1.23456789: +; NOHAZARD: // %bb.0: // %entry +; NOHAZARD-NEXT: mov w8, #1618 // =0x652 +; NOHAZARD-NEXT: movk w8, #16286, lsl #16 +; NOHAZARD-NEXT: mov z0.s, w8 +; NOHAZARD-NEXT: str q0, [x0] +; NOHAZARD-NEXT: ret +entry: + store <4 x float> , ptr %num, align 16 + ret void +} From 500090b469a28ebe39595e6b01d74afe9a4108a9 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 30 Oct 2024 13:16:05 +0000 Subject: [PATCH 2/3] Tidy diff --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 72 +++++++++---------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 60aa5f1bd2fe0..b5423aea9adf0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -430,50 +430,48 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { if (Value.getOpcode() == ISD::TargetConstantFP) return SDValue(); - ConstantFPSDNode *CFP = dyn_cast(Value); - if (!CFP) - return SDValue(); - if (!TLI.canUseIntLoadStoreForFloatValues()) return SDValue(); - EVT FloatTy = CFP->getValueType(0); - if (FloatTy == MVT::f32 && TLI.isTypeLegal(MVT::i32)) { - SDValue Con = - DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(32), - SDLoc(CFP), MVT::i32); - return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), - ST->getOriginalAlign(), MMOFlags, AAInfo); - } - - if (FloatTy == MVT::f64 && !TLI.isFPImmLegal(CFP->getValueAPF(), MVT::f64)) { - // If this target supports 64-bit registers, do a single 64-bit store. - if (TLI.isTypeLegal(MVT::i64)) { - SDValue Con = - DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(64), - SDLoc(CFP), MVT::i64); + if (ConstantFPSDNode *CFP = dyn_cast(Value)) { + if (CFP->getValueType(0) == MVT::f32 && + TLI.isTypeLegal(MVT::i32)) { + SDValue Con = DAG.getConstant(CFP->getValueAPF(). + bitcastToAPInt().zextOrTrunc(32), + SDLoc(CFP), MVT::i32); return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), ST->getOriginalAlign(), MMOFlags, AAInfo); } - if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) { - // Otherwise, if the target supports 32-bit registers, use 2 32-bit - // stores. If the target supports neither 32- nor 64-bits, this xform is - // certainly not worth it. - const APInt &IntVal = CFP->getValueAPF().bitcastToAPInt(); - SDValue Lo = DAG.getConstant(IntVal.trunc(32), dl, MVT::i32); - SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), dl, MVT::i32); - if (DAG.getDataLayout().isBigEndian()) - std::swap(Lo, Hi); - - Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), - ST->getOriginalAlign(), MMOFlags, AAInfo); - Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(4), dl); - Hi = DAG.getStore(Chain, dl, Hi, Ptr, - ST->getPointerInfo().getWithOffset(4), - ST->getOriginalAlign(), MMOFlags, AAInfo); - - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); + if (CFP->getValueType(0) == MVT::f64 && + !TLI.isFPImmLegal(CFP->getValueAPF(), MVT::f64)) { + // If this target supports 64-bit registers, do a single 64-bit store. + if (TLI.isTypeLegal(MVT::i64)) { + SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). + zextOrTrunc(64), SDLoc(CFP), MVT::i64); + return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), + ST->getOriginalAlign(), MMOFlags, AAInfo); + } + + if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) { + // Otherwise, if the target supports 32-bit registers, use 2 32-bit + // stores. If the target supports neither 32- nor 64-bits, this + // xform is certainly not worth it. + const APInt &IntVal = CFP->getValueAPF().bitcastToAPInt(); + SDValue Lo = DAG.getConstant(IntVal.trunc(32), dl, MVT::i32); + SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), dl, MVT::i32); + if (DAG.getDataLayout().isBigEndian()) + std::swap(Lo, Hi); + + Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), + ST->getOriginalAlign(), MMOFlags, AAInfo); + Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(4), dl); + Hi = DAG.getStore(Chain, dl, Hi, Ptr, + ST->getPointerInfo().getWithOffset(4), + ST->getOriginalAlign(), MMOFlags, AAInfo); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); + } } } return SDValue(); From 5b2808325e77f71f78eaa50f42f43f6e682a5aaa Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 30 Oct 2024 14:00:06 +0000 Subject: [PATCH 3/3] Fixups --- .../Target/AArch64/AArch64ISelLowering.cpp | 12 +++++------- .../sve-streaming-mode-fp-constant-stores.ll | 19 +++++++++++++++++-- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index bdabfafd43976..1c83c7231d767 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22514,8 +22514,7 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { /// movi v0.2d, #0 /// str q0, [x0] /// -static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St, - AArch64Subtarget const &Subtarget) { +static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { SDValue StVal = St.getValue(); EVT VT = StVal.getValueType(); @@ -22523,11 +22522,10 @@ static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St, if (VT.isScalableVector()) return SDValue(); - // Do not replace the FP store when it could result in a streaming memory + // Do not replace the vector store when it could result in a streaming memory // hazard. - if (VT.getVectorElementType().isFloatingPoint() && - Subtarget.getStreamingHazardSize() > 0 && - (Subtarget.isStreaming() || Subtarget.isStreamingCompatible())) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.canUseIntLoadStoreForFloatValues()) return SDValue(); // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or @@ -22659,7 +22657,7 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // If we get a splat of zeros, convert this vector store to a store of // scalars. They will be merged into store pairs of xzr thereby removing one // instruction and one register. - if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S, *Subtarget)) + if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S)) return ReplacedZeroSplat; // FIXME: The logic for deciding if an unaligned store should be split should diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fp-constant-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fp-constant-stores.ll index c060bc0a901ea..1890df2d8d805 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fp-constant-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fp-constant-stores.ll @@ -25,6 +25,21 @@ entry: ret void } +; (Integer types still use GPR stores) +define void @store_i64_0(ptr %num) { +; CHECK-LABEL: store_i64_0: +; CHECK: // %bb.0: +; CHECK-NEXT: str xzr, [x0] +; CHECK-NEXT: ret +; +; NOHAZARD-LABEL: store_i64_0: +; NOHAZARD: // %bb.0: +; NOHAZARD-NEXT: str xzr, [x0] +; NOHAZARD-NEXT: ret + store i64 0, ptr %num, align 8 + ret void +} + define void @"store_f64_1.0"(ptr %num) { ; CHECK-LABEL: store_f64_1.0: ; CHECK: // %bb.0: // %entry @@ -45,8 +60,8 @@ entry: define void @"store_f64_1.23456789"(ptr %num) { ; CHECK-LABEL: store_f64_1.23456789: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI3_0] ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret ;