diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index aee9797585dbd..b6f20e6f99a0a 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -973,6 +973,34 @@ unsigned getBLXOpcode(const MachineFunction &MF);
 unsigned gettBLXrOpcode(const MachineFunction &MF);
 unsigned getBLXpredOpcode(const MachineFunction &MF);
+inline bool isMVEVectorInstruction(const MachineInstr *MI) {
+  // This attempts to remove non-MVE instructions (scalar shifts), which
+  // are just DPU CX instructions.
+  switch (MI->getOpcode()) {
+  case ARM::MVE_SQSHL:
+  case ARM::MVE_SRSHR:
+  case ARM::MVE_UQSHL:
+  case ARM::MVE_URSHR:
+  case ARM::MVE_SQRSHR:
+  case ARM::MVE_UQRSHL:
+  case ARM::MVE_ASRLr:
+  case ARM::MVE_ASRLi:
+  case ARM::MVE_LSLLr:
+  case ARM::MVE_LSLLi:
+  case ARM::MVE_LSRL:
+  case ARM::MVE_SQRSHRL:
+  case ARM::MVE_SQSHLL:
+  case ARM::MVE_SRSHRL:
+  case ARM::MVE_UQRSHLL:
+  case ARM::MVE_UQSHLL:
+  case ARM::MVE_URSHRL:
+    return false;
+  }
+  const MCInstrDesc &MCID = MI->getDesc();
+  uint64_t Flags = MCID.TSFlags;
+  return (Flags & ARMII::DomainMask) == ARMII::DomainMVE;
+}
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
diff --git a/llvm/lib/Target/ARM/ARMLatencyMutations.cpp b/llvm/lib/Target/ARM/ARMLatencyMutations.cpp
new file mode 100644
index 0000000000000..3c86e8ab5892e
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMLatencyMutations.cpp
@@ -0,0 +1,978 @@
+//===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the ARM definition DAG scheduling mutations which
+/// change inter-instruction latencies
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMLatencyMutations.h"
+#include "ARMSubtarget.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include <algorithm>
+#include <array>
+#include <initializer_list>
+#include <memory>
+#include <utility>
+
+namespace llvm {
+
+namespace {
+
+// Precompute information about opcodes to speed up pass
+
+class InstructionInformation {
+protected:
+  struct IInfo {
+    bool HasBRegAddr : 1;      // B-side of addr gen is a register
+    bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
+    bool IsDivide : 1;         // Some form of integer divide
+    bool IsInlineShiftALU : 1; // Inline shift+ALU
+    bool IsMultiply : 1;       // Some form of integer multiply
+    bool IsMVEIntMAC : 1;      // MVE 8/16/32-bit integer MAC operation
+    bool IsNonSubwordLoad : 1; // Load which is a word or larger
+    bool IsShift : 1;          // Shift operation
+    bool IsRev : 1;            // REV operation
+    bool ProducesQP : 1;       // Produces a vector register result
+    bool ProducesDP : 1;       // Produces a double-precision register result
+    bool ProducesSP : 1;       // Produces a single-precision register result
+    bool ConsumesQP : 1;       // Consumes a vector register result
+    bool ConsumesDP : 1;       // Consumes a double-precision register result
+    bool ConsumesSP : 1;       // Consumes a single-precision register result
+    unsigned MVEIntMACMatched; // Matched operand type (for MVE)
+    unsigned AddressOpMask;    // Mask indicating which operands go into AGU
+    IInfo()
+        : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
+          IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
+          IsNonSubwordLoad(false), IsShift(false), IsRev(false),
+          ProducesQP(false), ProducesDP(false), ProducesSP(false),
+          ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
+          MVEIntMACMatched(0), AddressOpMask(0) {}
+  };
+  typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
+  IInfoArray Info;
+
+public:
+  // Always available information
+  unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
+  bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
+  bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
+  bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
+  bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
+  bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
+  bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
+  bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
+  bool isRev(unsigned Op) { return Info[Op].IsRev; }
+  bool isShift(unsigned Op) { return Info[Op].IsShift; }
+
+  // Information available if markDPProducersConsumers is called.
+  bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
+  bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
+  bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
+  bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
+  bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
+  bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }
+
+  bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
+    return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
+  }
+
+  InstructionInformation(const ARMBaseInstrInfo *TII);
+
+protected:
+  void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
+};
+
+InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
+  using namespace ARM;
+
+  std::initializer_list<unsigned> hasBRegAddrList = {
+      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
+      tLDRr,  tLDRBr,  tLDRHr,  tSTRr,  tSTRBr,  tSTRHr,
+  };
+  for (auto op : hasBRegAddrList) {
+    Info[op].HasBRegAddr = true;
+  }
+
+  std::initializer_list<unsigned> hasBRegAddrShiftList = {
+      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
+  };
+  for (auto op : hasBRegAddrShiftList) {
+    Info[op].HasBRegAddrShift = true;
+  }
+
+  Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;
+
+  std::initializer_list<unsigned> isInlineShiftALUList = {
+      t2ADCrs,  t2ADDSrs, t2ADDrs,  t2BICrs, t2EORrs,
+      t2ORNrs,  t2RSBSrs, t2RSBrs,  t2SBCrs, t2SUBrs,
+      t2SUBSrs, t2CMPrs,  t2CMNzrs, t2TEQrs, t2TSTrs,
+  };
+  for (auto op : isInlineShiftALUList) {
+    Info[op].IsInlineShiftALU = true;
+  }
+
+  Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;
+
+  std::initializer_list<unsigned> isMultiplyList = {
+      t2MUL,    t2MLA,     t2MLS,     t2SMLABB, t2SMLABT,  t2SMLAD,   t2SMLADX,
+      t2SMLAL,  t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
+      t2SMLATB, t2SMLATT,  t2SMLAWT,  t2SMLSD,  t2SMLSDX,  t2SMLSLD,  t2SMLSLDX,
+      t2SMMLA,  t2SMMLAR,  t2SMMLS,   t2SMMLSR, t2SMMUL,   t2SMMULR,  t2SMUAD,
+      t2SMUADX, t2SMULBB,  t2SMULBT,  t2SMULL,  t2SMULTB,  t2SMULTT,  t2SMULWT,
+      t2SMUSD,  t2SMUSDX,  t2UMAAL,   t2UMLAL,  t2UMULL,   tMUL,
+  };
+  for (auto op : isMultiplyList) {
+    Info[op].IsMultiply = true;
+  }
+
+  std::initializer_list<unsigned> isMVEIntMACList = {
+      MVE_VMLAS_qr_i16,  MVE_VMLAS_qr_i32,  MVE_VMLAS_qr_i8,
+      MVE_VMLA_qr_i16,   MVE_VMLA_qr_i32,   MVE_VMLA_qr_i8,
+      MVE_VQDMLAH_qrs16, MVE_VQDMLAH_qrs32, MVE_VQDMLAH_qrs8,
+      MVE_VQDMLASH_qrs16,  MVE_VQDMLASH_qrs32,  MVE_VQDMLASH_qrs8,
+      MVE_VQRDMLAH_qrs16,  MVE_VQRDMLAH_qrs32,  MVE_VQRDMLAH_qrs8,
+      MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
+      MVE_VQDMLADHXs16,    MVE_VQDMLADHXs32,    MVE_VQDMLADHXs8,
+      MVE_VQDMLADHs16,     MVE_VQDMLADHs32,     MVE_VQDMLADHs8,
+      MVE_VQDMLSDHXs16,    MVE_VQDMLSDHXs32,    MVE_VQDMLSDHXs8,
+      MVE_VQDMLSDHs16,     MVE_VQDMLSDHs32,     MVE_VQDMLSDHs8,
+      MVE_VQRDMLADHXs16,   MVE_VQRDMLADHXs32,   MVE_VQRDMLADHXs8,
+      MVE_VQRDMLADHs16,    MVE_VQRDMLADHs32,    MVE_VQRDMLADHs8,
+      MVE_VQRDMLSDHXs16,   MVE_VQRDMLSDHXs32,   MVE_VQRDMLSDHXs8,
+      MVE_VQRDMLSDHs16,    MVE_VQRDMLSDHs32,    MVE_VQRDMLSDHs8,
+  };
+  for (auto op : isMVEIntMACList) {
+    Info[op].IsMVEIntMAC = true;
+  }
+
+  std::initializer_list<unsigned> isNonSubwordLoadList = {
+      t2LDRi12, t2LDRi8,  t2LDR_POST,  t2LDR_PRE,  t2LDRpci,
+      t2LDRs,   t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
+      tLDRpci,  tLDRr,    tLDRspi,
+  };
+  for (auto op : isNonSubwordLoadList) {
+    Info[op].IsNonSubwordLoad = true;
+  }
+
+  std::initializer_list<unsigned> isRevList = {
+      t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
+  };
+  for (auto op : isRevList) {
+    Info[op].IsRev = true;
+  }
+
+  std::initializer_list<unsigned> isShiftList = {
+      t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
+      tASRri,  tASRrr,  tLSLSri, tLSLri,  tLSLrr,  tLSRri,  tLSRrr,  tROR,
+  };
+  for (auto op : isShiftList) {
+    Info[op].IsShift = true;
+  }
+
+  std::initializer_list<unsigned> Address1List = {
+      t2LDRBi12,
+      t2LDRBi8,
+      t2LDRBpci,
+      t2LDRBs,
+      t2LDRHi12,
+      t2LDRHi8,
+      t2LDRHpci,
+      t2LDRHs,
+      t2LDRSBi12,
+      t2LDRSBi8,
+      t2LDRSBpci,
+      t2LDRSBs,
+      t2LDRSHi12,
+      t2LDRSHi8,
+      t2LDRSHpci,
+      t2LDRSHs,
+      t2LDRi12,
+      t2LDRi8,
+      t2LDRpci,
+      t2LDRs,
+      tLDRBi,
+      tLDRBr,
+      tLDRHi,
+      tLDRHr,
+      tLDRSB,
+      tLDRSH,
+      tLDRi,
+      tLDRpci,
+      tLDRr,
+      tLDRspi,
+      t2STRBi12,
+      t2STRBi8,
+      t2STRBs,
+      t2STRHi12,
+      t2STRHi8,
+      t2STRHs,
+      t2STRi12,
+      t2STRi8,
+      t2STRs,
+      tSTRBi,
+      tSTRBr,
+      tSTRHi,
+      tSTRHr,
+      tSTRi,
+      tSTRr,
+      tSTRspi,
+      VLDRD,
+      VLDRH,
+      VLDRS,
+      VSTRD,
+      VSTRH,
+      VSTRS,
+      MVE_VLD20_16,
+      MVE_VLD20_32,
+      MVE_VLD20_8,
+      MVE_VLD21_16,
+      MVE_VLD21_32,
+      MVE_VLD21_8,
+      MVE_VLD40_16,
+      MVE_VLD40_32,
+      MVE_VLD40_8,
+      MVE_VLD41_16,
+      MVE_VLD41_32,
+      MVE_VLD41_8,
+      MVE_VLD42_16,
+      MVE_VLD42_32,
+      MVE_VLD42_8,
+      MVE_VLD43_16,
+      MVE_VLD43_32,
+      MVE_VLD43_8,
+      MVE_VLDRBS16,
+      MVE_VLDRBS16_rq,
+      MVE_VLDRBS32,
+      MVE_VLDRBS32_rq,
+      MVE_VLDRBU16,
+      MVE_VLDRBU16_rq,
+      MVE_VLDRBU32,
+      MVE_VLDRBU32_rq,
+      MVE_VLDRBU8,
+      MVE_VLDRBU8_rq,
+      MVE_VLDRDU64_qi,
+      MVE_VLDRDU64_rq,
+      MVE_VLDRDU64_rq_u,
+      MVE_VLDRHS32,
+      MVE_VLDRHS32_rq,
+      MVE_VLDRHS32_rq_u,
+      MVE_VLDRHU16,
+      MVE_VLDRHU16_rq,
+      MVE_VLDRHU16_rq_u,
+      MVE_VLDRHU32,
+      MVE_VLDRHU32_rq,
+      MVE_VLDRHU32_rq_u,
+      MVE_VLDRWU32,
+      MVE_VLDRWU32_qi,
+      MVE_VLDRWU32_rq,
+      MVE_VLDRWU32_rq_u,
+      MVE_VST20_16,
+      MVE_VST20_32,
+      MVE_VST20_8,
+      MVE_VST21_16,
+      MVE_VST21_32,
+      MVE_VST21_8,
+      MVE_VST40_16,
+      MVE_VST40_32,
+      MVE_VST40_8,
+      MVE_VST41_16,
+      MVE_VST41_32,
+      MVE_VST41_8,
+      MVE_VST42_16,
+      MVE_VST42_32,
+      MVE_VST42_8,
+      MVE_VST43_16,
+      MVE_VST43_32,
+      MVE_VST43_8,
+      MVE_VSTRB16,
+      MVE_VSTRB16_rq,
+      MVE_VSTRB32,
+      MVE_VSTRB32_rq,
+      MVE_VSTRBU8,
+      MVE_VSTRB8_rq,
+      MVE_VSTRD64_qi,
+      MVE_VSTRD64_rq,
+      MVE_VSTRD64_rq_u,
+      MVE_VSTRH32,
+      MVE_VSTRH32_rq,
+      MVE_VSTRH32_rq_u,
+      MVE_VSTRHU16,
+      MVE_VSTRH16_rq,
+      MVE_VSTRH16_rq_u,
+      MVE_VSTRWU32,
+      MVE_VSTRW32_qi,
+      MVE_VSTRW32_rq,
+      MVE_VSTRW32_rq_u,
+  };
+  std::initializer_list<unsigned> Address2List = {
+      t2LDRB_POST,
+      t2LDRB_PRE,
+      t2LDRDi8,
+      t2LDRH_POST,
+      t2LDRH_PRE,
+      t2LDRSB_POST,
+      t2LDRSB_PRE,
+      t2LDRSH_POST,
+      t2LDRSH_PRE,
+      t2LDR_POST,
+      t2LDR_PRE,
+      t2STRB_POST,
+      t2STRB_PRE,
+      t2STRDi8,
+      t2STRH_POST,
+      t2STRH_PRE,
+      t2STR_POST,
+      t2STR_PRE,
+      MVE_VLD20_16_wb,
+      MVE_VLD20_32_wb,
+      MVE_VLD20_8_wb,
+      MVE_VLD21_16_wb,
+      MVE_VLD21_32_wb,
+      MVE_VLD21_8_wb,
+      MVE_VLD40_16_wb,
+      MVE_VLD40_32_wb,
+      MVE_VLD40_8_wb,
+      MVE_VLD41_16_wb,
+      MVE_VLD41_32_wb,
+      MVE_VLD41_8_wb,
+      MVE_VLD42_16_wb,
+      MVE_VLD42_32_wb,
+      MVE_VLD42_8_wb,
+      MVE_VLD43_16_wb,
+      MVE_VLD43_32_wb,
+      MVE_VLD43_8_wb,
+      MVE_VLDRBS16_post,
+      MVE_VLDRBS16_pre,
+      MVE_VLDRBS32_post,
+      MVE_VLDRBS32_pre,
+      MVE_VLDRBU16_post,
+      MVE_VLDRBU16_pre,
+      MVE_VLDRBU32_post,
+      MVE_VLDRBU32_pre,
+      MVE_VLDRBU8_post,
+      MVE_VLDRBU8_pre,
+      MVE_VLDRDU64_qi_pre,
+      MVE_VLDRHS32_post,
+      MVE_VLDRHS32_pre,
+      MVE_VLDRHU16_post,
+      MVE_VLDRHU16_pre,
+      MVE_VLDRHU32_post,
+      MVE_VLDRHU32_pre,
+      MVE_VLDRWU32_post,
+      MVE_VLDRWU32_pre,
+      MVE_VLDRWU32_qi_pre,
+      MVE_VST20_16_wb,
+      MVE_VST20_32_wb,
+      MVE_VST20_8_wb,
+      MVE_VST21_16_wb,
+      MVE_VST21_32_wb,
+      MVE_VST21_8_wb,
+      MVE_VST40_16_wb,
+      MVE_VST40_32_wb,
+      MVE_VST40_8_wb,
+      MVE_VST41_16_wb,
+      MVE_VST41_32_wb,
+      MVE_VST41_8_wb,
+      MVE_VST42_16_wb,
+      MVE_VST42_32_wb,
+      MVE_VST42_8_wb,
+      MVE_VST43_16_wb,
+      MVE_VST43_32_wb,
+      MVE_VST43_8_wb,
+      MVE_VSTRB16_post,
+      MVE_VSTRB16_pre,
+      MVE_VSTRB32_post,
+      MVE_VSTRB32_pre,
+      MVE_VSTRBU8_post,
+      MVE_VSTRBU8_pre,
+      MVE_VSTRD64_qi_pre,
+      MVE_VSTRH32_post,
+      MVE_VSTRH32_pre,
+      MVE_VSTRHU16_post,
+      MVE_VSTRHU16_pre,
+      MVE_VSTRWU32_post,
+      MVE_VSTRWU32_pre,
+      MVE_VSTRW32_qi_pre,
+  };
+  std::initializer_list<unsigned> Address3List = {
+      t2LDRD_POST,
+      t2LDRD_PRE,
+      t2STRD_POST,
+      t2STRD_PRE,
+  };
+  // Compute a mask of which operands are involved in address computation
+  for (auto &op : Address1List) {
+    Info[op].AddressOpMask = 0x6;
+  }
+  for (auto &op : Address2List) {
+    Info[op].AddressOpMask = 0xc;
+  }
+  for (auto &op : Address3List) {
+    Info[op].AddressOpMask = 0x18;
+  }
+  for (auto &op : hasBRegAddrShiftList) {
+    Info[op].AddressOpMask |= 0x8;
+  }
+}
+
+void InstructionInformation::markDPProducersConsumers(
+    const ARMBaseInstrInfo *TII) {
+  // Learn about all instructions which have FP source/dest registers
+  for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
+    const MCInstrDesc &MID = TII->get(MI);
+    auto Operands = MID.operands();
+    for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
+      bool MarkQP = false, MarkDP = false, MarkSP = false;
+      switch (Operands[OI].RegClass) {
+      case ARM::MQPRRegClassID:
+      case ARM::DPRRegClassID:
+      case ARM::DPR_8RegClassID:
+      case ARM::DPR_VFP2RegClassID:
+      case ARM::DPairRegClassID:
+      case ARM::DPairSpcRegClassID:
+      case ARM::DQuadRegClassID:
+      case ARM::DQuadSpcRegClassID:
+      case ARM::DTripleRegClassID:
+      case ARM::DTripleSpcRegClassID:
+        MarkDP = true;
+        break;
+      case ARM::QPRRegClassID:
+      case ARM::QPR_8RegClassID:
+      case ARM::QPR_VFP2RegClassID:
+      case ARM::QQPRRegClassID:
+      case ARM::QQQQPRRegClassID:
+        MarkQP = true;
+        break;
+      case ARM::SPRRegClassID:
+      case ARM::SPR_8RegClassID:
+      case ARM::FPWithVPRRegClassID:
+        MarkSP = true;
+        break;
+      default:
+        break;
+      }
+      if (MarkQP) {
+        if (OI < MID.getNumDefs())
+          Info[MI].ProducesQP = true;
+        else
+          Info[MI].ConsumesQP = true;
+      }
+      if (MarkDP) {
+        if (OI < MID.getNumDefs())
+          Info[MI].ProducesDP = true;
+        else
+          Info[MI].ConsumesDP = true;
+      }
+      if (MarkSP) {
+        if (OI < MID.getNumDefs())
+          Info[MI].ProducesSP = true;
+        else
+          Info[MI].ConsumesSP = true;
+      }
+    }
+  }
+}
+
+} // anonymous namespace
+
+static bool hasImplicitCPSRUse(const MachineInstr *MI) {
+  return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
+}
+
+void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
+                                            unsigned latency) {
+  SDep Reverse = SrcDep;
+  Reverse.setSUnit(&SrcSU);
+  for (SDep &PDep : SrcDep.getSUnit()->Preds) {
+    if (PDep == Reverse) {
+      PDep.setLatency(latency);
+      SrcDep.getSUnit()->setDepthDirty();
+      break;
+    }
+  }
+  SrcDep.setLatency(latency);
+  SrcSU.setHeightDirty();
+}
+
+static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
+  return (a & 0xe) != (b & 0xe);
+}
+
+// Set output dependences to zero latency for processors which can
+// simultaneously issue to the same register. Returns true if a change
+// was made.
+bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
+  if (Dep.getKind() == SDep::Output) {
+    setBidirLatencies(ISU, Dep, 0);
+    return true;
+  }
+  return false;
+}
+
+// The graph doesn't look inside of bundles to determine their
+// scheduling boundaries and reports zero latency into and out of them
+// (except for CPSR into the bundle, which has latency 1).
+// Make some better scheduling assumptions:
+// 1) CPSR uses have zero latency; other uses have incoming latency 1
+// 2) CPSR defs retain a latency of zero; others have a latency of 1.
+//
+// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise
+unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {
+
+  SUnit &DepSU = *Dep.getSUnit();
+  const MachineInstr *SrcMI = ISU.getInstr();
+  unsigned SrcOpcode = SrcMI->getOpcode();
+  const MachineInstr *DstMI = DepSU.getInstr();
+  unsigned DstOpcode = DstMI->getOpcode();
+
+  if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
+    setBidirLatencies(
+        ISU, Dep,
+        (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
+    return 1;
+  }
+  if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
+      Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
+    setBidirLatencies(ISU, Dep, 1);
+    return 2;
+  }
+  return 0;
+}
+
+// Determine whether there is a memory RAW hazard here and set up latency
+// accordingly
+bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
+                                          unsigned latency) {
+  if (!Dep.isNormalMemory())
+    return false;
+  auto &SrcInst = *ISU.getInstr();
+  auto &DstInst = *Dep.getSUnit()->getInstr();
+  if (!SrcInst.mayStore() || !DstInst.mayLoad())
+    return false;
+
+  auto SrcMO = *SrcInst.memoperands().begin();
+  auto DstMO = *DstInst.memoperands().begin();
+  auto SrcVal = SrcMO->getValue();
+  auto DstVal = DstMO->getValue();
+  auto SrcPseudoVal = SrcMO->getPseudoValue();
+  auto DstPseudoVal = DstMO->getPseudoValue();
+  if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
+      SrcMO->getOffset() == DstMO->getOffset()) {
+    setBidirLatencies(ISU, Dep, latency);
+    return true;
+  } else if (SrcPseudoVal && DstPseudoVal &&
+             SrcPseudoVal->kind() == DstPseudoVal->kind() &&
+             SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
+    // Spills/fills
+    auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
+    auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
+    if (FS0 == FS1) {
+      setBidirLatencies(ISU, Dep, latency);
+      return true;
+    }
+  }
+  return false;
+}
+
+namespace {
+
+std::unique_ptr<InstructionInformation> II;
+
+class CortexM7InstructionInformation : public InstructionInformation {
+public:
+  CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
+      : InstructionInformation(TII) {}
+};
+
+class CortexM7Overrides : public ARMOverrideBypasses {
+public:
+  CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
+      : ARMOverrideBypasses(TII, AA) {
+    if (!II)
+      II.reset(new CortexM7InstructionInformation(TII));
+  }
+
+  void modifyBypasses(SUnit &) override;
+};
+
+void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
+  const MachineInstr *SrcMI = ISU.getInstr();
+  unsigned SrcOpcode = SrcMI->getOpcode();
+  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);
+
+  // Walk the successors looking for latency overrides that are needed
+  for (SDep &Dep : ISU.Succs) {
+
+    // Output dependences should have 0 latency, as M7 is able to
+    // schedule writers to the same register for simultaneous issue.
+    if (zeroOutputDependences(ISU, Dep))
+      continue;
+
+    if (memoryRAWHazard(ISU, Dep, 4))
+      continue;
+
+    // Ignore dependencies other than data
+    if (Dep.getKind() != SDep::Data)
+      continue;
+
+    SUnit &DepSU = *Dep.getSUnit();
+    if (DepSU.isBoundaryNode())
+      continue;
+
+    if (makeBundleAssumptions(ISU, Dep) == 1)
+      continue;
+
+    const MachineInstr *DstMI = DepSU.getInstr();
+    unsigned DstOpcode = DstMI->getOpcode();
+
+    // Word loads into any multiply or divide instruction are considered
+    // unable to bypass their scheduling stage. Didn't do this in the .td file
+    // because we cannot easily create a read advance that is 0 from certain
+    // writer classes and 1 from all the rest.
+    // (The other way around would have been easy.)
+    if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))
+      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
+
+    // Word loads into the B operand of a load/store are considered unable to
+    // bypass their scheduling stage.
+    // Cannot do this in the .td file because we need to decide between -1
+    // and -2 for ReadAdvance.
+    if (isNSWload && II->hasBRegAddr(DstOpcode) &&
+        DstMI->getOperand(2).getReg() == Dep.getReg())
+      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
+
+    // Multiplies into any address generation cannot bypass from EX3. Cannot
+    // do this in the .td file because we need to decide between -1 and -2 for
+    // ReadAdvance.
+    if (II->isMultiply(SrcOpcode)) {
+      unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;
+      for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {
+        if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&
+            DstMI->getOperand(i).getReg() == Dep.getReg()) {
+          setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
+          break;
+        }
+      }
+    }
+
+    // Mismatched conditional producers take longer on M7; they end up looking
+    // like they were produced at EX3 and read at IS.
+    if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&
+        (SrcOpcode == ARM::BUNDLE ||
+         mismatchedPred(TII->getPredicate(*SrcMI),
+                        TII->getPredicate(*DstMI)))) {
+      unsigned Lat = 1;
+      // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
+      if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&
+          DstMI->getOperand(1).getReg() == Dep.getReg())
+        Lat = 2;
+      Lat = std::min(3u, Dep.getLatency() + Lat);
+      setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));
+    }
+
+    // CC setter into conditional producer shouldn't have a latency of more
+    // than 1 unless it's due to an implicit read. (All the "true" readers
+    // of the condition code use an implicit read, and predicates use an
+    // explicit.)
+    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
+        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
+      setBidirLatencies(ISU, Dep, 1);
+
+    // REV instructions cannot bypass directly into the EX1 shifter. The
+    // code is slightly inexact as it doesn't attempt to ensure that the bypass
+    // is to the shifter operands.
+    if (II->isRev(SrcOpcode)) {
+      if (II->isInlineShiftALU(DstOpcode))
+        setBidirLatencies(ISU, Dep, 2);
+      else if (II->isShift(DstOpcode))
+        setBidirLatencies(ISU, Dep, 1);
+    }
+  }
+}
+
+class M85InstructionInformation : public InstructionInformation {
+public:
+  M85InstructionInformation(const ARMBaseInstrInfo *t)
+      : InstructionInformation(t) {
+    markDPProducersConsumers(t);
+  }
+};
+
+class M85Overrides : public ARMOverrideBypasses {
+public:
+  M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
+      : ARMOverrideBypasses(t, a) {
+    if (!II)
+      II.reset(new M85InstructionInformation(t));
+  }
+
+  void modifyBypasses(SUnit &) override;
+
+private:
+  unsigned computeBypassStage(const MCSchedClassDesc *SCD);
+  signed modifyMixedWidthFP(const MachineInstr *SrcMI,
+                            const MachineInstr *DstMI, unsigned RegID,
+                            const MCSchedClassDesc *SCD);
+};
+
+unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
+  auto SM = DAG->getSchedModel();
+  unsigned DefIdx = 0; // just look for the first output's timing
+  if (DefIdx < SCDesc->NumWriteLatencyEntries) {
+    // Lookup the definition's write latency in SubtargetInfo.
+    const MCWriteLatencyEntry *WLEntry =
+        SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
+    unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
+    if (Latency == 4)
+      return 2;
+    else if (Latency == 5)
+      return 3;
+    else if (Latency > 3)
+      return 3;
+    else
+      return Latency;
+  }
+  return 2;
+}
+
+// Latency changes for bypassing between FP registers of different sizes:
+//
+// Note that mixed DP/SP are unlikely because of the semantics
+// of C. Mixed MVE/SP are quite common when MVE intrinsics are used.
+signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
+                                        const MachineInstr *DstMI,
+                                        unsigned RegID,
+                                        const MCSchedClassDesc *SCD) {
+
+  if (!II->producesSP(SrcMI->getOpcode()) &&
+      !II->producesDP(SrcMI->getOpcode()) &&
+      !II->producesQP(SrcMI->getOpcode()))
+    return 0;
+
+  if (Register::isVirtualRegister(RegID)) {
+    if (II->producesSP(SrcMI->getOpcode()) &&
+        II->consumesDP(DstMI->getOpcode())) {
+      for (auto &OP : SrcMI->operands())
+        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
+            OP.getSubReg() == ARM::ssub_1)
+          return 5 - computeBypassStage(SCD);
+    } else if (II->producesSP(SrcMI->getOpcode()) &&
+               II->consumesQP(DstMI->getOpcode())) {
+      for (auto &OP : SrcMI->operands())
+        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
+            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
+          return 5 - computeBypassStage(SCD) -
+                 ((OP.getSubReg() == ARM::ssub_2 ||
+                   OP.getSubReg() == ARM::ssub_3)
+                      ? 1
+                      : 0);
+    } else if (II->producesDP(SrcMI->getOpcode()) &&
+               II->consumesQP(DstMI->getOpcode())) {
+      for (auto &OP : SrcMI->operands())
+        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
+            OP.getSubReg() == ARM::ssub_1)
+          return -1;
+    } else if (II->producesDP(SrcMI->getOpcode()) &&
+               II->consumesSP(DstMI->getOpcode())) {
+      for (auto &OP : DstMI->operands())
+        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
+            OP.getSubReg() == ARM::ssub_1)
+          return 5 - computeBypassStage(SCD);
+    } else if (II->producesQP(SrcMI->getOpcode()) &&
+               II->consumesSP(DstMI->getOpcode())) {
+      for (auto &OP : DstMI->operands())
+        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
+            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
+          return 5 - computeBypassStage(SCD) +
+                 ((OP.getSubReg() == ARM::ssub_2 ||
+                   OP.getSubReg() == ARM::ssub_3)
+                      ? 1
+                      : 0);
+    } else if (II->producesQP(SrcMI->getOpcode()) &&
+               II->consumesDP(DstMI->getOpcode())) {
+      for (auto &OP : DstMI->operands())
+        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
+            OP.getSubReg() == ARM::ssub_1)
+          return 1;
+    }
+  } else if (Register::isPhysicalRegister(RegID)) {
+    // Note that when the producer is narrower, not all of the producers
+    // may be present in the scheduling graph; somewhere earlier in the
+    // compiler, an implicit def/use of the aliased full register gets
+    // added to the producer, and so only that producer is seen as *the*
+    // single producer. This behavior also has the unfortunate effect of
+    // serializing the producers in the compiler's view of things.
+    if (II->producesSP(SrcMI->getOpcode()) &&
+        II->consumesDP(DstMI->getOpcode())) {
+      for (auto &OP : SrcMI->operands())
+        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
+            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
+            (OP.getReg() == RegID ||
+             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
+             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
+          return 5 - computeBypassStage(SCD);
+    } else if (II->producesSP(SrcMI->getOpcode()) &&
+               II->consumesQP(DstMI->getOpcode())) {
+      for (auto &OP : SrcMI->operands())
+        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
+            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
+            (OP.getReg() == RegID ||
+             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
+             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
+          return 5 - computeBypassStage(SCD) -
+                 (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
+    } else if (II->producesDP(SrcMI->getOpcode()) &&
+               II->consumesQP(DstMI->getOpcode())) {
+      for (auto &OP : SrcMI->operands())
+        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
+            OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
+            (OP.getReg() == RegID ||
+             (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
+          return -1;
+    } else if (II->producesDP(SrcMI->getOpcode()) &&
+               II->consumesSP(DstMI->getOpcode())) {
+      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
+        return 5 - computeBypassStage(SCD);
+    } else if (II->producesQP(SrcMI->getOpcode()) &&
+               II->consumesSP(DstMI->getOpcode())) {
+      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
+        return 5 - computeBypassStage(SCD) +
+               (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
+    } else if (II->producesQP(SrcMI->getOpcode()) &&
+               II->consumesDP(DstMI->getOpcode())) {
+      if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
+        return 1;
+    }
+  }
+  return 0;
+}
+
+void M85Overrides::modifyBypasses(SUnit &ISU) {
+  const MachineInstr *SrcMI = ISU.getInstr();
+  unsigned SrcOpcode = SrcMI->getOpcode();
+  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);
+
+  // Walk the successors looking for latency overrides that are needed
+  for (SDep &Dep : ISU.Succs) {
+
+    // Output dependences should have 0 latency, as CortexM85 is able to
+    // schedule writers to the same register for simultaneous issue.
+    if (zeroOutputDependences(ISU, Dep))
+      continue;
+
+    if (memoryRAWHazard(ISU, Dep, 3))
+      continue;
+
+    // Ignore dependencies other than data or strong ordering.
+    if (Dep.getKind() != SDep::Data)
+      continue;
+
+    SUnit &DepSU = *Dep.getSUnit();
+    if (DepSU.isBoundaryNode())
+      continue;
+
+    if (makeBundleAssumptions(ISU, Dep) == 1)
+      continue;
+
+    const MachineInstr *DstMI = DepSU.getInstr();
+    unsigned DstOpcode = DstMI->getOpcode();
+
+    // Word loads into the B operand of a load/store with a shift cannot
+    // bypass their scheduling stage. Cannot do this in the .td file because
+    // we need to decide between -1 and -2 for ReadAdvance.
+
+    if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
+        DstMI->getOperand(3).getImm() != 0 && // shift operand
+        DstMI->getOperand(2).getReg() == Dep.getReg())
+      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
+
+    if (isNSWload && isMVEVectorInstruction(DstMI)) {
+      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
+    }
+
+    if (II->isMVEIntMAC(DstOpcode) &&
+        II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
+        DstMI->getOperand(0).isReg() &&
+        DstMI->getOperand(0).getReg() == Dep.getReg())
+      setBidirLatencies(ISU, Dep, Dep.getLatency() - 1);
+
+    // CC setter into conditional producer shouldn't have a latency of more
+    // than 0 unless it's due to an implicit read.
+    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
+        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
+      setBidirLatencies(ISU, Dep, 0);
+
+    if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
+                                         DAG->getSchedClass(&ISU)))
+      setBidirLatencies(ISU, Dep,
+                        std::max(0, signed(Dep.getLatency()) + ALat));
+
+    if (II->isRev(SrcOpcode)) {
+      if (II->isInlineShiftALU(DstOpcode))
+        setBidirLatencies(ISU, Dep, 1);
+      else if (II->isShift(DstOpcode))
+        setBidirLatencies(ISU, Dep, 1);
+    }
+  }
+}
+
+// Add M55 specific overrides for latencies between instructions. Currently it:
+// - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
+class CortexM55Overrides : public ARMOverrideBypasses {
+public:
+  CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
+      : ARMOverrideBypasses(TII, AA) {}
+
+  void modifyBypasses(SUnit &SU) override {
+    MachineInstr *SrcMI = SU.getInstr();
+    if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
+      return;
+
+    for (SDep &Dep : SU.Succs) {
+      if (Dep.getKind() != SDep::Data)
+        continue;
+      SUnit &DepSU = *Dep.getSUnit();
+      if (DepSU.isBoundaryNode())
+        continue;
+      MachineInstr *DstMI = DepSU.getInstr();
+
+      if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
+        setBidirLatencies(SU, Dep, 3);
+    }
+  }
+};
+
+} // end anonymous namespace
+
+void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
+  DAG = DAGInstrs;
+  for (SUnit &ISU : DAGInstrs->SUnits) {
+    if (ISU.isBoundaryNode())
+      continue;
+    modifyBypasses(ISU);
+  }
+  if (DAGInstrs->ExitSU.getInstr())
+    modifyBypasses(DAGInstrs->ExitSU);
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
+  if (ST.isCortexM85())
+    return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
+  else if (ST.isCortexM7())
+    return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
+  else if (ST.isCortexM55())
+    return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);
+
+  return nullptr;
+}
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/ARM/ARMLatencyMutations.h b/llvm/lib/Target/ARM/ARMLatencyMutations.h
new file mode 100644
index 0000000000000..a4b8de0be51f7
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMLatencyMutations.h
@@ -0,0 +1,56 @@
+//===- ARMLatencyMutations.h - ARM Latency Mutations ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the ARM definition DAG scheduling mutations which
+/// change inter-instruction latencies
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_LATENCYMUTATIONS_H
+#define LLVM_LIB_TARGET_ARM_LATENCYMUTATIONS_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+
+namespace llvm {
+
+class AAResults;
+class ARMBaseInstrInfo;
+
+/// Post-process the DAG to override bypass latencies between instructions
+/// that the per-operand scheduling model cannot describe.
+class ARMOverrideBypasses : public ScheduleDAGMutation {
+public:
+  ARMOverrideBypasses(const ARMBaseInstrInfo *t, AAResults *a)
+      : ScheduleDAGMutation(), TII(t), AA(a) {}
+
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+
+private:
+  virtual void modifyBypasses(SUnit &) = 0;
+
+protected:
+  const ARMBaseInstrInfo *TII;
+  AAResults *AA;
+  ScheduleDAGInstrs *DAG = nullptr;
+
+  static void setBidirLatencies(SUnit &SrcSU, SDep &SrcDep, unsigned latency);
+  static bool zeroOutputDependences(SUnit &ISU, SDep &Dep);
+  unsigned makeBundleAssumptions(SUnit &ISU, SDep &Dep);
+  bool memoryRAWHazard(SUnit &ISU, SDep &Dep, unsigned latency);
+};
+
+/// Note that you have to add:
+///   DAG.addMutation(createARMLatencyMutations(ST, AA));
+/// to ARMPassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation>
+createARMLatencyMutations(const class ARMSubtarget &, AAResults *AA);
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
index b94a5fc161469..22e7e6893c1a8 100644
--- a/llvm/lib/Target/ARM/ARMProcessors.td
+++ b/llvm/lib/Target/ARM/ARMProcessors.td
@@ -95,8 +95,12 @@ def ProcR52plus : SubtargetFeature<"r52plus", "ARMProcFamily", "CortexR52plus",
 def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
                               "Cortex-M3 ARM processors", []>;
+def ProcM55 : SubtargetFeature<"m55", "ARMProcFamily", "CortexM55",
+                               "Cortex-M55 ARM processors", []>;
 def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7",
                               "Cortex-M7 ARM processors", []>;
+def ProcM85 : SubtargetFeature<"m85", "ARMProcFamily", "CortexM85",
+                               "Cortex-M85 ARM processors", []>;
 
 //===----------------------------------------------------------------------===//
 // ARM processors
@@ -384,6 +388,7 @@ def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline,
                                                     FeatureFixCMSE_CVE_2021_35465]>;
 
 def : ProcessorModel<"cortex-m55", CortexM55Model, [ARMv81mMainline,
+                                                    ProcM55,
                                                     FeatureDSP,
                                                     FeatureFPARMv8_D16,
                                                     FeatureUseMISched,
@@ -394,6 +399,7 @@ def : ProcessorModel<"cortex-m55", CortexM55Model, [ARMv81mMainline,
                                                     FeatureFixCMSE_CVE_2021_35465]>;
 
 def : ProcessorModel<"cortex-m85", CortexM85Model, [ARMv81mMainline,
+                                                    ProcM85,
                                                     FeatureDSP,
                                                     FeatureFPARMv8_D16,
                                                     FeaturePACBTI,
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 2f7af05a259f8..611eeac9ef712 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -291,6 +291,8 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   bool isCortexA15() const { return ARMProcFamily == CortexA15; }
   bool isSwift() const { return ARMProcFamily == Swift; }
   bool isCortexM3() const { return ARMProcFamily == CortexM3; }
+  bool isCortexM55() const { return ARMProcFamily == CortexM55; }
   bool isCortexM7() const { return ARMProcFamily == CortexM7; }
+  bool isCortexM85() const { return ARMProcFamily == CortexM85; }
   bool isLikeA9() const { return isCortexA9() || isCortexA15() || isKrait(); }
   bool isCortexR5() const { return ARMProcFamily == CortexR5; }
   bool isKrait() const { return ARMProcFamily == Krait; }
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index a58c63dcf762d..7d4c6c3f9ebee 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -11,6 +11,7 @@
 #include "ARMTargetMachine.h"
 #include "ARM.h"
+#include "ARMLatencyMutations.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMMacroFusion.h"
 #include "ARMSubtarget.h"
@@ -371,6 +372,8 @@ class ARMPassConfig : public TargetPassConfig {
     const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>();
     if (ST.hasFusion())
       DAG->addMutation(createARMMacroFusionDAGMutation());
+    if (auto Mutation = createARMLatencyMutations(ST, C->AA))
+      DAG->addMutation(std::move(Mutation));
     return DAG;
   }
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
index 3d6af28b43753..a39629bd8aeb0 100644
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -62,6 +62,7 @@ add_llvm_target(ARMCodeGen
   MVETailPredication.cpp
   MVEVPTBlockPass.cpp
   MVETPAndVPTOptimisationsPass.cpp
+  ARMLatencyMutations.cpp
   Thumb1FrameLowering.cpp
   Thumb1InstrInfo.cpp
   ThumbRegisterInfo.cpp