Skip to content

Commit d38fead

Browse files
authored
Merge branch 'main' into mlir-fix-eccc6e2
2 parents 43efb86 + a910a6a commit d38fead

33 files changed

+546
-169
lines changed

lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def test_readMemory(self):
127127
self.continue_to_exit()
128128

129129
# Flakey on 32-bit Arm Linux.
130-
@skipif(oslist=["linux"], archs=["arm$"])
130+
@skipIf(oslist=["linux"], archs=["arm$"])
131131
def test_writeMemory(self):
132132
"""
133133
Tests the 'writeMemory' request

lldb/test/Shell/SymbolFile/PDB/calling-conventions-arm.test

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
REQUIRES: target-windows, lld, (target-arm || target-aarch64)
2+
23
RUN: %build --compiler=clang-cl --arch=32 --nodefaultlib --output=%t.exe %S/Inputs/CallingConventionsTest.cpp
4+
RUN: lldb-test symbols -dump-ast %t.exe | FileCheck %s
5+
RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck %s
6+
37
RUN: %build --compiler=clang-cl --arch=64 --nodefaultlib --output=%t.exe %S/Inputs/CallingConventionsTest.cpp
48
RUN: lldb-test symbols -dump-ast %t.exe | FileCheck %s
9+
RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck %s
510

611
CHECK: Module: {{.*}}
712
CHECK-DAG: int (*FuncCCallPtr)();

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -878,9 +878,6 @@ class TargetTransformInfoImplBase {
878878
switch (ICA.getID()) {
879879
default:
880880
break;
881-
case Intrinsic::experimental_vector_histogram_add:
882-
// For now, we want explicit support from the target for histograms.
883-
return InstructionCost::getInvalid();
884881
case Intrinsic::allow_runtime_check:
885882
case Intrinsic::allow_ubsan_check:
886883
case Intrinsic::annotation:

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2105,6 +2105,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
21052105
}
21062106
case Intrinsic::get_active_lane_mask:
21072107
case Intrinsic::experimental_vector_match:
2108+
case Intrinsic::experimental_vector_histogram_add:
2109+
case Intrinsic::experimental_vector_histogram_uadd_sat:
2110+
case Intrinsic::experimental_vector_histogram_umax:
2111+
case Intrinsic::experimental_vector_histogram_umin:
21082112
return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
21092113
case Intrinsic::modf:
21102114
case Intrinsic::sincos:
@@ -2457,6 +2461,51 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
24572461
return thisT()->getShuffleCost(TTI::SK_Reverse, cast<VectorType>(RetTy),
24582462
cast<VectorType>(ICA.getArgTypes()[0]), {},
24592463
CostKind, 0, cast<VectorType>(RetTy));
2464+
case Intrinsic::experimental_vector_histogram_add:
2465+
case Intrinsic::experimental_vector_histogram_uadd_sat:
2466+
case Intrinsic::experimental_vector_histogram_umax:
2467+
case Intrinsic::experimental_vector_histogram_umin: {
2468+
FixedVectorType *PtrsTy = dyn_cast<FixedVectorType>(ICA.getArgTypes()[0]);
2469+
Type *EltTy = ICA.getArgTypes()[1];
2470+
2471+
// Targets with scalable vectors must handle this on their own.
2472+
if (!PtrsTy)
2473+
return InstructionCost::getInvalid();
2474+
2475+
Align Alignment = thisT()->DL.getABITypeAlign(EltTy);
2476+
InstructionCost Cost = 0;
2477+
Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, PtrsTy,
2478+
CostKind, 1, nullptr, nullptr);
2479+
Cost += thisT()->getMemoryOpCost(Instruction::Load, EltTy, Alignment, 0,
2480+
CostKind);
2481+
switch (IID) {
2482+
default:
2483+
llvm_unreachable("Unhandled histogram update operation.");
2484+
case Intrinsic::experimental_vector_histogram_add:
2485+
Cost +=
2486+
thisT()->getArithmeticInstrCost(Instruction::Add, EltTy, CostKind);
2487+
break;
2488+
case Intrinsic::experimental_vector_histogram_uadd_sat: {
2489+
IntrinsicCostAttributes UAddSat(Intrinsic::uadd_sat, EltTy, {EltTy});
2490+
Cost += thisT()->getIntrinsicInstrCost(UAddSat, CostKind);
2491+
break;
2492+
}
2493+
case Intrinsic::experimental_vector_histogram_umax: {
2494+
IntrinsicCostAttributes UMax(Intrinsic::umax, EltTy, {EltTy});
2495+
Cost += thisT()->getIntrinsicInstrCost(UMax, CostKind);
2496+
break;
2497+
}
2498+
case Intrinsic::experimental_vector_histogram_umin: {
2499+
IntrinsicCostAttributes UMin(Intrinsic::umin, EltTy, {EltTy});
2500+
Cost += thisT()->getIntrinsicInstrCost(UMin, CostKind);
2501+
break;
2502+
}
2503+
}
2504+
Cost += thisT()->getMemoryOpCost(Instruction::Store, EltTy, Alignment, 0,
2505+
CostKind);
2506+
Cost *= PtrsTy->getNumElements();
2507+
return Cost;
2508+
}
24602509
case Intrinsic::get_active_lane_mask: {
24612510
Type *ArgTy = ICA.getArgTypes()[0];
24622511
EVT ResVT = getTLI()->getValueType(DL, RetTy, true);

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1314,12 +1314,12 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
13141314
if ((Name.starts_with("lifetime.start") ||
13151315
Name.starts_with("lifetime.end")) &&
13161316
F->arg_size() == 2) {
1317+
Intrinsic::ID IID = Name.starts_with("lifetime.start")
1318+
? Intrinsic::lifetime_start
1319+
: Intrinsic::lifetime_end;
13171320
rename(F);
1318-
NewFn = Intrinsic::getOrInsertDeclaration(
1319-
F->getParent(),
1320-
Name.starts_with("lifetime.start") ? Intrinsic::lifetime_start
1321-
: Intrinsic::lifetime_end,
1322-
F->getArg(0)->getType());
1321+
NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID,
1322+
F->getArg(0)->getType());
13231323
return true;
13241324
}
13251325
break;

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,17 @@ static bool isUnpackedVectorVT(EVT VecVT) {
554554
VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
555555
}
556556

557-
static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
557+
static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
558+
const IntrinsicCostAttributes &ICA) {
559+
// We need to know at least the number of elements in the vector of buckets
560+
// and the size of each element to update.
561+
if (ICA.getArgTypes().size() < 2)
562+
return InstructionCost::getInvalid();
563+
564+
// Only interested in costing for the hardware instruction from SVE2.
565+
if (!ST->hasSVE2())
566+
return InstructionCost::getInvalid();
567+
558568
Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
559569
Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
560570
unsigned TotalHistCnts = 1;
@@ -579,9 +589,11 @@ static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
579589

580590
unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
581591
TotalHistCnts = EC / NaturalVectorWidth;
592+
593+
return InstructionCost(BaseHistCntCost * TotalHistCnts);
582594
}
583595

584-
return InstructionCost(BaseHistCntCost * TotalHistCnts);
596+
return InstructionCost::getInvalid();
585597
}
586598

587599
InstructionCost
@@ -597,10 +609,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
597609
return InstructionCost::getInvalid();
598610

599611
switch (ICA.getID()) {
600-
case Intrinsic::experimental_vector_histogram_add:
601-
if (!ST->hasSVE2())
602-
return InstructionCost::getInvalid();
603-
return getHistogramCost(ICA);
612+
case Intrinsic::experimental_vector_histogram_add: {
613+
InstructionCost HistCost = getHistogramCost(ST, ICA);
614+
// If the cost isn't valid, we may still be able to scalarize
615+
if (HistCost.isValid())
616+
return HistCost;
617+
break;
618+
}
604619
case Intrinsic::umin:
605620
case Intrinsic::umax:
606621
case Intrinsic::smin:

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 10 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -997,89 +997,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
997997
const Function &F = MF.getFunction();
998998

999999
// Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1000-
// dispatch registers are function args.
1001-
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
1002-
1003-
if (isShader(F.getCallingConv())) {
1004-
bool IsPixelShader =
1005-
F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
1006-
1007-
// Calculate the number of VGPR registers based on the SPI input registers
1008-
uint32_t InputEna = 0;
1009-
uint32_t InputAddr = 0;
1010-
unsigned LastEna = 0;
1011-
1012-
if (IsPixelShader) {
1013-
// Note for IsPixelShader:
1014-
// By this stage, all enabled inputs are tagged in InputAddr as well.
1015-
// We will use InputAddr to determine whether the input counts against the
1016-
// vgpr total and only use the InputEnable to determine the last input
1017-
// that is relevant - if extra arguments are used, then we have to honour
1018-
// the InputAddr for any intermediate non-enabled inputs.
1019-
InputEna = MFI->getPSInputEnable();
1020-
InputAddr = MFI->getPSInputAddr();
1021-
1022-
// We only need to consider input args up to the last used arg.
1023-
assert((InputEna || InputAddr) &&
1024-
"PSInputAddr and PSInputEnable should "
1025-
"never both be 0 for AMDGPU_PS shaders");
1026-
// There are some rare circumstances where InputAddr is non-zero and
1027-
// InputEna can be set to 0. In this case we default to setting LastEna
1028-
// to 1.
1029-
LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
1030-
}
1000+
// dispatch registers are function args.
1001+
unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1002+
WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
10311003

1032-
// FIXME: We should be using the number of registers determined during
1033-
// calling convention lowering to legalize the types.
1034-
const DataLayout &DL = F.getDataLayout();
1035-
unsigned PSArgCount = 0;
1036-
unsigned IntermediateVGPR = 0;
1037-
for (auto &Arg : F.args()) {
1038-
unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
1039-
if (Arg.hasAttribute(Attribute::InReg)) {
1040-
WaveDispatchNumSGPR += NumRegs;
1041-
} else {
1042-
// If this is a PS shader and we're processing the PS Input args (first
1043-
// 16 VGPR), use the InputEna and InputAddr bits to define how many
1044-
// VGPRs are actually used.
1045-
// Any extra VGPR arguments are handled as normal arguments (and
1046-
// contribute to the VGPR count whether they're used or not).
1047-
if (IsPixelShader && PSArgCount < 16) {
1048-
if ((1 << PSArgCount) & InputAddr) {
1049-
if (PSArgCount < LastEna)
1050-
WaveDispatchNumVGPR += NumRegs;
1051-
else
1052-
IntermediateVGPR += NumRegs;
1053-
}
1054-
PSArgCount++;
1055-
} else {
1056-
// If there are extra arguments we have to include the allocation for
1057-
// the non-used (but enabled with InputAddr) input arguments
1058-
if (IntermediateVGPR) {
1059-
WaveDispatchNumVGPR += IntermediateVGPR;
1060-
IntermediateVGPR = 0;
1061-
}
1062-
WaveDispatchNumVGPR += NumRegs;
1063-
}
1064-
}
1065-
}
1004+
if (WaveDispatchNumSGPR) {
10661005
ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
1067-
{ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
1006+
{ProgInfo.NumSGPR,
1007+
MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
1008+
Ctx)},
1009+
Ctx);
1010+
}
10681011

1012+
if (WaveDispatchNumVGPR) {
10691013
ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
10701014
{ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
10711015

10721016
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
10731017
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1074-
} else if (isKernel(F.getCallingConv()) &&
1075-
MFI->getNumKernargPreloadedSGPRs()) {
1076-
// Consider cases where the total number of UserSGPRs with trailing
1077-
// allocated preload SGPRs, is greater than the number of explicitly
1078-
// referenced SGPRs.
1079-
const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
1080-
CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
1081-
ProgInfo.NumSGPR =
1082-
AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
10831018
}
10841019

10851020
// Adjust number of registers used to meet default/requested minimum/maximum

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
580580
++i;
581581
}
582582

583+
if (Info->getNumKernargPreloadedSGPRs())
584+
Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
585+
583586
TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
584587
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
585588
return true;
@@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments(
743746
if (!determineAssignments(Assigner, SplitArgs, CCInfo))
744747
return false;
745748

749+
if (IsEntryFunc) {
750+
// This assumes the registers are allocated by CCInfo in ascending order
751+
// with no gaps.
752+
Info->setNumWaveDispatchSGPRs(
753+
CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
754+
Info->setNumWaveDispatchVGPRs(
755+
CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
756+
}
757+
746758
FormalArgHandler Handler(B, MRI);
747759
if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
748760
return false;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments(
31063106
if (!IsKernel) {
31073107
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
31083108
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3109+
3110+
// This assumes the registers are allocated by CCInfo in ascending order
3111+
// with no gaps.
3112+
Info->setNumWaveDispatchSGPRs(
3113+
CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3114+
Info->setNumWaveDispatchVGPRs(
3115+
CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3116+
} else if (Info->getNumKernargPreloadedSGPRs()) {
3117+
Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
31093118
}
31103119

31113120
SmallVector<SDValue, 16> Chains;

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
728728
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
729729
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
730730
HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
731+
NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
732+
NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
731733
HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
732734
Occupancy(MFI.getOccupancy()),
733735
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
@@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
784786
WaveLimiter = YamlMFI.WaveLimiter;
785787
HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
786788
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
789+
NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
790+
NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
787791
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
788792
ReturnsVoid = YamlMFI.ReturnsVoid;
789793
IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;

0 commit comments

Comments
 (0)