1414#include " GCNSubtarget.h"
1515#include " Utils/AMDGPUBaseInfo.h"
1616#include " llvm/Analysis/CycleAnalysis.h"
17+ #include " llvm/Analysis/TargetTransformInfo.h"
18+ #include " llvm/Analysis/UniformityAnalysis.h"
1719#include " llvm/CodeGen/TargetPassConfig.h"
20+ #include " llvm/IR/IRBuilder.h"
1821#include " llvm/IR/IntrinsicsAMDGPU.h"
1922#include " llvm/IR/IntrinsicsR600.h"
2023#include " llvm/InitializePasses.h"
@@ -1299,6 +1302,130 @@ struct AAAMDGPUNoAGPR
12991302
13001303const char AAAMDGPUNoAGPR::ID = 0 ;
13011304
1305+ struct AAAMDGPUUniform : public StateWrapper <BooleanState, AbstractAttribute> {
1306+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
1307+ AAAMDGPUUniform (const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1308+
1309+ // / Create an abstract attribute view for the position \p IRP.
1310+ static AAAMDGPUUniform &createForPosition (const IRPosition &IRP,
1311+ Attributor &A);
1312+
1313+ // / See AbstractAttribute::getName()
1314+ const std::string getName () const override { return " AAAMDGPUUniform" ; }
1315+
1316+ const std::string getAsStr (Attributor *A) const override {
1317+ return getAssumed () ? " inreg" : " non-inreg" ;
1318+ }
1319+
1320+ void trackStatistics () const override {}
1321+
1322+ // / See AbstractAttribute::getIdAddr()
1323+ const char *getIdAddr () const override { return &ID; }
1324+
1325+ // / This function should return true if the type of the \p AA is
1326+ // / AAAMDGPUUniform
1327+ static bool classof (const AbstractAttribute *AA) {
1328+ return (AA->getIdAddr () == &ID);
1329+ }
1330+
1331+ // / Unique ID (due to the unique address)
1332+ static const char ID;
1333+ };
1334+
1335+ const char AAAMDGPUUniform::ID = 0 ;
1336+
1337+ struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
1338+ AAAMDGPUUniformArgument (const IRPosition &IRP, Attributor &A)
1339+ : AAAMDGPUUniform(IRP, A) {}
1340+
1341+ void initialize (Attributor &A) override {
1342+ Argument *Arg = getAssociatedArgument ();
1343+ CallingConv::ID CC = Arg->getParent ()->getCallingConv ();
1344+ if (Arg->hasAttribute (Attribute::InReg)) {
1345+ indicateOptimisticFixpoint ();
1346+ return ;
1347+ }
1348+ if (AMDGPU::isEntryFunctionCC (CC)) {
1349+ // We only use isArgPassedInSGPR on kernel entry function argument, so the
1350+ // potential i1 argument change will not affect this.
1351+ if (AMDGPU::isArgPassedInSGPR (Arg))
1352+ indicateOptimisticFixpoint ();
1353+ else
1354+ indicatePessimisticFixpoint ();
1355+ }
1356+ }
1357+
1358+ ChangeStatus updateImpl (Attributor &A) override {
1359+ unsigned ArgNo = getAssociatedArgument ()->getArgNo ();
1360+
1361+ auto isUniform = [&](AbstractCallSite ACS) -> bool {
1362+ CallBase *CB = ACS.getInstruction ();
1363+ Value *V = CB->getArgOperandUse (ArgNo);
1364+ if (isa<Constant>(V))
1365+ return true ;
1366+ Function *F = nullptr ;
1367+ if (auto *Arg = dyn_cast<Argument>(V)) {
1368+ auto *AA =
1369+ A.getOrCreateAAFor <AAAMDGPUUniform>(IRPosition::argument (*Arg));
1370+ if (AA)
1371+ return AA->isValidState ();
1372+ F = Arg->getParent ();
1373+ } else if (auto *I = dyn_cast<Instruction>(V)) {
1374+ F = I->getFunction ();
1375+ }
1376+
1377+ if (F) {
1378+ auto *UA =
1379+ A.getInfoCache ()
1380+ .getAnalysisResultForFunction <UniformityInfoAnalysis>(*F);
1381+ return UA && UA->isUniform (V);
1382+ }
1383+
1384+ // What else can it be here?
1385+ return false ;
1386+ };
1387+
1388+ bool UsedAssumedInformation = true ;
1389+ if (!A.checkForAllCallSites (isUniform, *this , /* RequireAllCallSites=*/ true ,
1390+ UsedAssumedInformation))
1391+ return indicatePessimisticFixpoint ();
1392+
1393+ if (!UsedAssumedInformation)
1394+ return indicateOptimisticFixpoint ();
1395+
1396+ return ChangeStatus::UNCHANGED;
1397+ }
1398+
1399+ ChangeStatus manifest (Attributor &A) override {
1400+ Argument *Arg = getAssociatedArgument ();
1401+ if (AMDGPU::isEntryFunctionCC (Arg->getParent ()->getCallingConv ()))
1402+ return ChangeStatus::UNCHANGED;
1403+ // If the argument already has inreg attribute, we will not do anything
1404+ // about it.
1405+ if (Arg->hasAttribute (Attribute::InReg))
1406+ return ChangeStatus::UNCHANGED;
1407+ // Add both inreg and "uniform" attribute to the argument. We will emit a
1408+ // readfirstlane at each call site for inreg uniform argument, and the
1409+ // "uniform" attribute will be removed later.
1410+ LLVMContext &Ctx = Arg->getContext ();
1411+ return A.manifestAttrs (getIRPosition (),
1412+ {Attribute::get (Ctx, Attribute::InReg),
1413+ Attribute::get (Ctx, " uniform" )});
1414+ }
1415+ };
1416+
1417+ AAAMDGPUUniform &AAAMDGPUUniform::createForPosition (const IRPosition &IRP,
1418+ Attributor &A) {
1419+ switch (IRP.getPositionKind ()) {
1420+ case IRPosition::IRP_ARGUMENT:
1421+ return *new (A.Allocator ) AAAMDGPUUniformArgument (IRP, A);
1422+ // TODO: Since inreg is also allowed for return value, maybe we need to add
1423+ // AAAMDGPUUniformCallSiteReturned?
1424+ default :
1425+ llvm_unreachable (" not a valid position for AAAMDGPUUniform" );
1426+ }
1427+ }
1428+
13021429// / Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
13031430// / based on the finalized 'amdgpu-flat-work-group-size' attribute.
13041431// / Both attributes start with narrow ranges that expand during iteration.
@@ -1367,6 +1494,59 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
13671494 return Changed;
13681495}
13691496
1497+ // / Emit the readfirstlane intrinsic for all inreg uniform function arguments at
1498+ // / each call site. The inreg uniform attribute combination is set by
1499+ // / AAAMDGPUUniform. This function provides a workaround for a downstream issue
1500+ // / where failing to emit a waterfall loop for 'inreg' arguments may result in
1501+ // / an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
1502+ // / loop for inreg uniform arguments here, because the 'inreg' attribute set by
1503+ // / AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
1504+ // / appropriate.
1505+ static bool emitReadFirstLaneForInregUniformArgs (Module &M) {
1506+ std::vector<std::pair<CallBase *, unsigned >> WorkList;
1507+
1508+ for (Function &F : M) {
1509+ if (F.isDeclaration ())
1510+ continue ;
1511+ for (Argument &Arg : F.args ()) {
1512+ if (!Arg.hasAttribute (Attribute::InReg) || !Arg.hasAttribute (" uniform" ))
1513+ continue ;
1514+ unsigned ArgNo = Arg.getArgNo ();
1515+ for (Use &U : F.uses ()) {
1516+ auto *CB = dyn_cast<CallBase>(U.getUser ());
1517+ if (!CB)
1518+ continue ;
1519+ // We will skip the call site argument when itself is an inreg argument.
1520+ // In this case, it will already be in SGPR.
1521+ if (auto *CSArg = dyn_cast<Argument>(CB->getArgOperand (ArgNo))) {
1522+ if (CSArg->hasAttribute (Attribute::InReg))
1523+ continue ;
1524+ }
1525+ WorkList.emplace_back (CB, ArgNo);
1526+ }
1527+ // We don't count this as changed since it just stays within this pass.
1528+ Arg.removeAttr (" uniform" );
1529+ }
1530+ }
1531+
1532+ if (WorkList.empty ())
1533+ return false ;
1534+
1535+ for (auto &[CB, ArgNo] : WorkList) {
1536+ Value *V = CB->getArgOperand (ArgNo);
1537+ IRBuilder<> Builder (CB);
1538+ Value *NewV = Builder.CreateIntrinsic (V->getType (),
1539+ Intrinsic::amdgcn_readfirstlane, {V});
1540+ CB->setArgOperand (ArgNo, NewV);
1541+ if (auto *I = dyn_cast<Instruction>(V)) {
1542+ if (I->use_empty ())
1543+ I->eraseFromParent ();
1544+ }
1545+ }
1546+
1547+ return true ;
1548+ }
1549+
13701550static bool runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
13711551 AMDGPUAttributorOptions Options,
13721552 ThinOrFullLTOPhase LTOPhase) {
@@ -1385,7 +1565,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13851565 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
13861566 &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
13871567 &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1388- &AAInstanceInfo::ID});
1568+ &AAInstanceInfo::ID, &AAAMDGPUUniform::ID });
13891569
13901570 AttributorConfig AC (CGUpdater);
13911571 AC.IsClosedWorldModule = Options.IsClosedWorld ;
@@ -1438,11 +1618,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14381618 IRPosition::value (*CmpX->getPointerOperand ()));
14391619 }
14401620 }
1621+
1622+ if (!AMDGPU::isEntryFunctionCC (F->getCallingConv ())) {
1623+ for (auto &Arg : F->args ())
1624+ A.getOrCreateAAFor <AAAMDGPUUniform>(IRPosition::argument (Arg));
1625+ }
14411626 }
14421627
14431628 bool Changed = A.run () == ChangeStatus::CHANGED;
14441629
14451630 Changed |= updateWavesPerEU (M, TM);
1631+ Changed |= emitReadFirstLaneForInregUniformArgs (M);
14461632
14471633 return Changed;
14481634}
@@ -1470,6 +1656,7 @@ class AMDGPUAttributorLegacy : public ModulePass {
14701656
14711657 void getAnalysisUsage (AnalysisUsage &AU) const override {
14721658 AU.addRequired <CycleInfoWrapperPass>();
1659+ AU.addRequired <UniformityInfoWrapperPass>();
14731660 }
14741661
14751662 StringRef getPassName () const override { return " AMDGPU Attributor" ; }
0 commit comments