From d2d0597f9ee1100662c21bcefbf6331172bb287c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 22 Apr 2025 17:04:22 +0100
Subject: [PATCH 1/4] [DAG] shouldReduceLoadWidth - add optional byte offset argument

Based on feedback for #129695 - we need to be able to determine the load offset of smaller loads when deciding whether a multiple-use load should be split (in particular for AVX subvector extractions).

This patch adds a std::optional<unsigned> ByteOffset argument to shouldReduceLoadWidth calls where we know the constant offset, allowing targets to make use of it in future patches.
---
 llvm/include/llvm/CodeGen/TargetLowering.h | 6 ++++--
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 +++++-----
 .../CodeGen/SelectionDAG/TargetLowering.cpp | 20 +++++++++++--------
 .../Target/AArch64/AArch64ISelLowering.cpp | 9 +++++----
 llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 ++--
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 ++++----
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 5 ++---
 llvm/lib/Target/BPF/BPFISelLowering.h | 5 +++--
 .../Target/Hexagon/HexagonISelLowering.cpp | 12 ++++++-----
 llvm/lib/Target/Hexagon/HexagonISelLowering.h | 4 ++--
 llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +++---
 llvm/lib/Target/X86/X86ISelLowering.h | 8 ++++++--
 12 files changed, 56 insertions(+), 42 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 00c36266a069f..818058c5bf8e3 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1824,8 +1824,10 @@ class TargetLoweringBase {
   /// Return true if it is profitable to reduce a load to a smaller type.
   /// Example: (i16 (trunc (i32 (load x))) -> i16 load x
-  virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
-                                     EVT NewVT) const {
+  /// Example: (i16 (trunc (srl (i32 (load x)), 16)) -> i16 load x+2
+  virtual bool shouldReduceLoadWidth(
+      SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+      std::optional<unsigned> ByteOffset = std::nullopt) const {
     // By default, assume that it is cheaper to extract a subvector from a wide
     // vector load rather than creating multiple narrow vector loads.
     if (NewVT.isVector() && !SDValue(Load, 0).hasOneUse())
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3f3f87d1f5658..f03a9617da003 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6690,7 +6690,7 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
       !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
     return false;
-  if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
+  if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT, /*ByteOffset=*/0))
     return false;
   return true;
@@ -6701,9 +6701,11 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
                                     unsigned ShAmt) {
   if (!LDST)
     return false;
+  // Only allow byte offsets.
   if (ShAmt % 8)
     return false;
+  const unsigned ByteShAmt = ShAmt / 8;
   // Do not generate loads of non-round integer types since these can
   // be expensive (and would be wrong if the type is not byte sized).
@@ -6727,8 +6729,6 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
   // Ensure that this isn't going to produce an unsupported memory access.
   if (ShAmt) {
-    assert(ShAmt % 8 == 0 && "ShAmt is byte offset");
-    const unsigned ByteShAmt = ShAmt / 8;
     const Align LDSTAlign = LDST->getAlign();
     const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt);
     if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
@@ -6768,7 +6768,7 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
         Load->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
       return false;
-    if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT))
+    if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT, ByteShAmt))
       return false;
   } else {
     assert(isa<StoreSDNode>(LDST) && "It is not a Load nor a Store SDNode");
@@ -25268,7 +25268,8 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
   TypeSize Offset = VT.getStoreSize() * (Index / NumElts);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))
+  if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT,
+                                 /*ByteOffset=*/Offset.getFixedValue()))
     return SDValue();
   // The narrow load will be offset from the base address of the old load if
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 3995216e3d689..bd85529a7d076 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4726,8 +4726,6 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
         // for the narrowed load.
         for (unsigned width = 8; width < origWidth; width *= 2) {
           EVT newVT = EVT::getIntegerVT(*DAG.getContext(), width);
-          if (!shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT))
-            continue;
           APInt newMask = APInt::getLowBitsSet(maskWidth, width);
           // Avoid accessing any padding here for now (we could use memWidth
           // instead of origWidth here otherwise).
@@ -4737,8 +4735,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
            unsigned ptrOffset =
                Layout.isLittleEndian() ? offset : memWidth - width - offset;
            unsigned IsFast = 0;
+           assert((ptrOffset % 8) == 0 && "Non-Bytealigned pointer offset");
            Align NewAlign = commonAlignment(Lod->getAlign(), ptrOffset / 8);
-           if (allowsMemoryAccess(
+           if (shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT,
+                                     ptrOffset / 8) &&
+               allowsMemoryAccess(
                    *DAG.getContext(), Layout, newVT, Lod->getAddressSpace(),
                    NewAlign, Lod->getMemOperand()->getFlags(), &IsFast) &&
                IsFast) {
@@ -12175,17 +12176,17 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT,
   ISD::LoadExtType ExtTy =
       ResultVT.bitsGT(VecEltVT) ? ISD::EXTLOAD : ISD::NON_EXTLOAD;
-  if (!isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
-      !shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
+  if (!isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
     return SDValue();
+  std::optional<unsigned> ByteOffset;
   Align Alignment = OriginalLoad->getAlign();
   MachinePointerInfo MPI;
   if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
     int Elt = ConstEltNo->getZExtValue();
-    unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
-    MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
-    Alignment = commonAlignment(Alignment, PtrOff);
+    ByteOffset = VecEltVT.getSizeInBits() * Elt / 8;
+    MPI = OriginalLoad->getPointerInfo().getWithOffset(*ByteOffset);
+    Alignment = commonAlignment(Alignment, *ByteOffset);
   } else {
     // Discard the pointer info except the address space because the memory
     // operand can't represent this new access since the offset is variable.
@@ -12193,6 +12194,9 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT,
     Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8);
   }
+  if (!shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT, ByteOffset))
+    return SDValue();
+
   unsigned IsFast = 0;
   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
                           OriginalLoad->getAddressSpace(), Alignment,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 771eee1b3fecf..84d64bc835077 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16519,11 +16519,12 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   return false;
 }
-bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
-                                                  ISD::LoadExtType ExtTy,
-                                                  EVT NewVT) const {
+bool AArch64TargetLowering::shouldReduceLoadWidth(
+    SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+    std::optional<unsigned> ByteOffset) const {
   // TODO: This may be worth removing. Check regression tests for diffs.
-  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
+  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
+                                                 ByteOffset))
     return false;
   // If we're reducing the load width in order to avoid having to use an extra
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 0d51ef2be8631..8b5d2ec9e6ddf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -695,8 +695,8 @@ class AArch64TargetLowering : public TargetLowering {
                           MachineFunction &MF,
                           unsigned Intrinsic) const override;
-  bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
-                             EVT NewVT) const override;
+  bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+                             std::optional<unsigned> ByteOffset) const override;
   bool shouldRemoveRedundantExtend(SDValue Op) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 2846405a2538c..236c373e70250 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -819,11 +819,11 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
 }
-bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
-                                                 ISD::LoadExtType ExtTy,
-                                                 EVT NewVT) const {
+bool AMDGPUTargetLowering::shouldReduceLoadWidth(
+    SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
+    std::optional<unsigned> ByteOffset) const {
   // TODO: This may be worth removing. Check regression tests for diffs.
-  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
+  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
     return false;
   unsigned NewSize = NewVT.getStoreSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fa9d61ec37c24..a42214865ccfd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -215,9 +215,8 @@ class AMDGPUTargetLowering : public TargetLowering {
   bool isFPImmLegal(const APFloat &Imm, EVT VT,
                     bool ForCodeSize) const override;
   bool ShouldShrinkFPConstant(EVT VT) const override;
-  bool shouldReduceLoadWidth(SDNode *Load,
-                             ISD::LoadExtType ExtType,
-                             EVT ExtVT) const override;
+  bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT,
+                             std::optional<unsigned> ByteOffset) const override;
   bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG,
                                const MachineMemOperand &MMO) const final;
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h
index ad048ad05e6dd..8104895cb7f14 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -135,8 +135,9 @@ class BPFTargetLowering : public TargetLowering {
   //   ctx = ctx + reloc_offset
   //   ... (*(u8 *)(ctx + 1)) & 0x80 ...
   // which will be rejected by the verifier.
-  bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
-                             EVT NewVT) const override {
+  bool
+  shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+                        std::optional<unsigned> ByteOffset) const override {
     return false;
   }
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 4c479ac41be12..fe12f99b91cd3 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -3821,19 +3821,21 @@ HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
   return TargetLowering::findRepresentativeClass(TRI, VT);
 }
-bool HexagonTargetLowering::shouldReduceLoadWidth(SDNode *Load,
-      ISD::LoadExtType ExtTy, EVT NewVT) const {
+bool HexagonTargetLowering::shouldReduceLoadWidth(
+    SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+    std::optional<unsigned> ByteOffset) const {
   // TODO: This may be worth removing. Check regression tests for diffs.
-  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
+  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
+                                                 ByteOffset))
     return false;
   auto *L = cast<LoadSDNode>(Load);
-  std::pair<SDValue,int> BO = getBaseAndOffset(L->getBasePtr());
+  std::pair<SDValue, int> BO = getBaseAndOffset(L->getBasePtr());
   // Small-data object, do not shrink.
   if (BO.first.getOpcode() == HexagonISD::CONST32_GP)
     return false;
   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(BO.first)) {
-    auto &HTM = static_cast<const HexagonTargetMachine&>(getTargetMachine());
+    auto &HTM = static_cast<const HexagonTargetMachine &>(getTargetMachine());
     const auto *GO = dyn_cast_or_null<const GlobalObject>(GA->getGlobal());
     return !GO || !HTM.getObjFileLowering()->isGlobalInSmallSection(GO, HTM);
   }
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index 4df88b3a8abd7..1321bee44a295 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -342,8 +342,8 @@ class HexagonTargetLowering : public TargetLowering {
   SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const override;
-  bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
-                             EVT NewVT) const override;
+  bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+                             std::optional<unsigned> ByteOffset) const override;
   void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 993118c52564e..f624eab61d5b8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3258,9 +3258,9 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
   return false;
 }
-bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
-                                              ISD::LoadExtType ExtTy,
-                                              EVT NewVT) const {
+bool X86TargetLowering::shouldReduceLoadWidth(
+    SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+    std::optional<unsigned> ByteOffset) const {
   assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 4a2b35e9efe7c..74d46bb260399 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1346,6 +1346,9 @@ namespace llvm {
       Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0);
       return Op.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+             Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
+             (Op.getOpcode() == ISD::LOAD &&
+              getTargetConstantFromLoad(cast<LoadSDNode>(Op))) ||
             TargetLowering::isTargetCanonicalConstantNode(Op);
     }
@@ -1501,8 +1504,9 @@ namespace llvm {
     /// Return true if we believe it is correct and profitable to reduce the
     /// load node to a smaller type.
-    bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
-                               EVT NewVT) const override;
+    bool
+    shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+                          std::optional<unsigned> ByteOffset) const override;
     /// Return true if the specified scalar FP type is computed in an SSE
     /// register, not on the X87 floating point stack.

From bddc9f8812c15521573abb892413151351ae0b9e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 22 Apr 2025 17:58:46 +0100
Subject: [PATCH 2/4] Add comment describing ByteOffset param

---
 llvm/include/llvm/CodeGen/TargetLowering.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 818058c5bf8e3..657d8637d6811 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1823,6 +1823,8 @@ class TargetLoweringBase {
   virtual bool ShouldShrinkFPConstant(EVT) const { return true; }
   /// Return true if it is profitable to reduce a load to a smaller type.
+  /// \p ByteOffset is only set if we know the pointer offset at compile time
+  /// otherwise we should assume that additional pointer math is required.
   /// Example: (i16 (trunc (i32 (load x))) -> i16 load x
   /// Example: (i16 (trunc (srl (i32 (load x)), 16)) -> i16 load x+2
   virtual bool shouldReduceLoadWidth(

From b814b2a6a199b35d1aed1cfcb6e943e86fe40e6a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 23 Apr 2025 09:52:22 +0100
Subject: [PATCH 3/4] Remove isTargetCanonicalConstantNode change mistakenly brought in from #129695

---
 llvm/lib/Target/X86/X86ISelLowering.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 74d46bb260399..7926292fc5bcf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1346,9 +1346,6 @@ namespace llvm {
       Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0);
       return Op.getOpcode() == X86ISD::VBROADCAST_LOAD ||
-             Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
-             (Op.getOpcode() == ISD::LOAD &&
-              getTargetConstantFromLoad(cast<LoadSDNode>(Op))) ||
             TargetLowering::isTargetCanonicalConstantNode(Op);
     }

From c08bfb81582c1407f3e30e187eb1a1ad4a309939 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 23 Apr 2025 10:04:34 +0100
Subject: [PATCH 4/4] Ensure TypeSize is fixed before computing ByteOffset

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 52eab8d6eec53..83b782fe103c8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -25265,10 +25265,12 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
   // It's fine to use TypeSize here as we know the offset will not be negative.
   TypeSize Offset = VT.getStoreSize() * (Index / NumElts);
+  std::optional<unsigned> ByteOffset;
+  if (Offset.isFixed())
+    ByteOffset = Offset.getFixedValue();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT,
-                                 /*ByteOffset=*/Offset.getFixedValue()))
+  if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT, ByteOffset))
     return SDValue();
   // The narrow load will be offset from the base address of the old load if
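
For illustration only, and not part of the patch series above: a minimal sketch of how a target override might consume the new ByteOffset argument. The class name MyTargetLowering and its profitability heuristics are invented for this example; only the shouldReduceLoadWidth signature comes from the patch, and real targets (e.g. the X86 follow-up hinted at in the first commit message) will apply their own cost rules.

// Hypothetical target override - a sketch, not an implementation from the patch.
bool MyTargetLowering::shouldReduceLoadWidth(
    SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
    std::optional<unsigned> ByteOffset) const {
  // Keep the generic multi-use vector-load check from the base class.
  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT, ByteOffset))
    return false;

  // Without a known compile-time offset, assume extra pointer math will be
  // required and only narrow loads with a single user.
  if (!ByteOffset)
    return SDValue(Load, 0).hasOneUse();

  // With a known constant offset, allow the narrower load only when it stays
  // naturally aligned to its own store size (e.g. a 16-byte subvector
  // extracted at byte 16 or 32 of a wider load).
  uint64_t NewSize = NewVT.getStoreSize().getKnownMinValue();
  return NewSize != 0 && *ByteOffset % NewSize == 0;
}

The key point the sketch demonstrates is the distinction the new parameter makes possible: std::nullopt means "offset unknown, additional pointer arithmetic may be needed", while a concrete value lets the target reason about alignment and subvector boundaries before agreeing to split the load.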