@@ -1115,47 +1115,25 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
11151115 Function *F = getAssociatedFunction ();
11161116 auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
11171117
1118- auto TakeRange = [&](std::pair<unsigned , unsigned > R) {
1119- auto [Min, Max] = R;
1120- ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1121- IntegerRangeState RangeState (Range);
1122- clampStateAndIndicateChange (this ->getState (), RangeState);
1123- indicateOptimisticFixpoint ();
1124- };
1125-
1126- std::pair<unsigned , unsigned > MaxWavesPerEURange{
1127- 1U , InfoCache.getMaxWavesPerEU (*F)};
1128-
11291118 // If the attribute exists, we will honor it if it is not the default.
11301119 if (auto Attr = InfoCache.getWavesPerEUAttr (*F)) {
1120+ std::pair<unsigned , unsigned > MaxWavesPerEURange{
1121+ 1U , InfoCache.getMaxWavesPerEU (*F)};
11311122 if (*Attr != MaxWavesPerEURange) {
1132- TakeRange (*Attr);
1123+ auto [Min, Max] = *Attr;
1124+ ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1125+ IntegerRangeState RangeState (Range);
1126+ this ->getState () = RangeState;
1127+ indicateOptimisticFixpoint ();
11331128 return ;
11341129 }
11351130 }
11361131
1137- // Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the
1138- // calculation of waves per EU involves flat work group size, we can't
1139- // simply use an assumed flat work group size as a start point, because the
1140- // update of flat work group size is in an inverse direction of waves per
1141- // EU. However, we can still do something if it is an entry function. Since
1142- // an entry function is a terminal node, and flat work group size either
1143- // from attribute or default will be used anyway, we can take that value and
1144- // calculate the waves per EU based on it. This result can't be updated by
1145- // no means, but that could still allow us to propagate it.
1146- if (AMDGPU::isEntryFunctionCC (F->getCallingConv ())) {
1147- std::pair<unsigned , unsigned > FlatWorkGroupSize;
1148- if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr (*F))
1149- FlatWorkGroupSize = *Attr;
1150- else
1151- FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize (*F);
1152- TakeRange (InfoCache.getEffectiveWavesPerEU (*F, MaxWavesPerEURange,
1153- FlatWorkGroupSize));
1154- }
1132+ if (AMDGPU::isEntryFunctionCC (F->getCallingConv ()))
1133+ indicatePessimisticFixpoint ();
11551134 }
11561135
11571136 ChangeStatus updateImpl (Attributor &A) override {
1158- auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
11591137 ChangeStatus Change = ChangeStatus::UNCHANGED;
11601138
11611139 auto CheckCallSite = [&](AbstractCallSite CS) {
@@ -1164,24 +1142,21 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
11641142 LLVM_DEBUG (dbgs () << ' [' << getName () << " ] Call " << Caller->getName ()
11651143 << " ->" << Func->getName () << ' \n ' );
11661144
1167- const auto *CallerInfo = A.getAAFor <AAAMDWavesPerEU>(
1145+ const auto *CallerAA = A.getAAFor <AAAMDWavesPerEU>(
11681146 *this , IRPosition::function (*Caller), DepClassTy::REQUIRED);
1169- const auto *AssumedGroupSize = A.getAAFor <AAAMDFlatWorkGroupSize>(
1170- *this , IRPosition::function (*Func), DepClassTy::REQUIRED);
1171- if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState () ||
1172- !AssumedGroupSize->isValidState ())
1147+ if (!CallerAA || !CallerAA->isValidState ())
11731148 return false ;
11741149
1175- unsigned Min, Max ;
1176- std::tie (Min, Max) = InfoCache. getEffectiveWavesPerEU (
1177- *Caller,
1178- {CallerInfo-> getAssumed (). getLower ().getZExtValue (),
1179- CallerInfo ->getAssumed ().getUpper ().getZExtValue () - 1 },
1180- {AssumedGroupSize-> getAssumed (). getLower (). getZExtValue (),
1181- AssumedGroupSize-> getAssumed (). getUpper (). getZExtValue () - 1 } );
1182- ConstantRange CallerRange ( APInt ( 32 , Min), APInt ( 32 , Max + 1 )) ;
1183- IntegerRangeState CallerRangeState (CallerRange);
1184- Change |= clampStateAndIndicateChange ( this -> getState (), CallerRangeState) ;
1150+ ConstantRange Assumed = getAssumed () ;
1151+ unsigned Min = std::max (Assumed. getLower (). getZExtValue (),
1152+ CallerAA-> getAssumed (). getLower (). getZExtValue ());
1153+ unsigned Max = std::max (Assumed. getUpper ().getZExtValue (),
1154+ CallerAA ->getAssumed ().getUpper ().getZExtValue ());
1155+ ConstantRange Range ( APInt ( 32 , Min), APInt ( 32 , Max));
1156+ IntegerRangeState RangeState (Range );
1157+ getState () = RangeState ;
1158+ Change |= getState () == Assumed ? ChangeStatus::UNCHANGED
1159+ : ChangeStatus::CHANGED ;
11851160
11861161 return true ;
11871162 };
@@ -1320,6 +1295,74 @@ struct AAAMDGPUNoAGPR
13201295
13211296const char AAAMDGPUNoAGPR::ID = 0 ;
13221297
1298+ // / Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
1299+ // / based on the finalized 'amdgpu-flat-work-group-size' attribute.
1300+ // / Both attributes start with narrow ranges that expand during iteration.
1301+ // / However, a narrower flat-workgroup-size leads to a wider waves-per-eu range,
1302+ // / preventing optimal updates later. Therefore, waves-per-eu can't be updated
1303+ // / with intermediate values during the attributor run. We defer the
1304+ // / finalization of waves-per-eu until after the flat-workgroup-size is
1305+ // / finalized.
1306+ // / TODO: Remove this and move similar logic back into the attributor run once
1307+ // / we have a better representation for waves-per-eu.
1308+ static bool updateWavesPerEU (Module &M, TargetMachine &TM) {
1309+ bool Changed = false ;
1310+
1311+ LLVMContext &Ctx = M.getContext ();
1312+
1313+ for (Function &F : M) {
1314+ if (F.isDeclaration ())
1315+ continue ;
1316+
1317+ const GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(F);
1318+
1319+ std::optional<std::pair<unsigned , std::optional<unsigned >>>
1320+ FlatWgrpSizeAttr =
1321+ AMDGPU::getIntegerPairAttribute (F, " amdgpu-flat-work-group-size" );
1322+
1323+ unsigned MinWavesPerEU = ST.getMinWavesPerEU ();
1324+ unsigned MaxWavesPerEU = ST.getMaxWavesPerEU ();
1325+
1326+ unsigned MinFlatWgrpSize = ST.getMinFlatWorkGroupSize ();
1327+ unsigned MaxFlatWgrpSize = ST.getMaxFlatWorkGroupSize ();
1328+ if (FlatWgrpSizeAttr.has_value ()) {
1329+ MinFlatWgrpSize = FlatWgrpSizeAttr->first ;
1330+ MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second );
1331+ }
1332+
1333+ // Start with the "best" range.
1334+ unsigned Min = MinWavesPerEU;
1335+ unsigned Max = MinWavesPerEU;
1336+
1337+ // Compute the range from flat workgroup size. `getWavesPerEU` will also
1338+ // account for the 'amdgpu-waves-er-eu' attribute.
1339+ auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
1340+ ST.getWavesPerEU (F, {MinFlatWgrpSize, MaxFlatWgrpSize});
1341+
1342+ // For the lower bound, we have to "tighten" it.
1343+ Min = std::max (Min, MinFromFlatWgrpSize);
1344+ // For the upper bound, we have to "extend" it.
1345+ Max = std::max (Max, MaxFromFlatWgrpSize);
1346+
1347+ // Clamp the range to the max range.
1348+ Min = std::max (Min, MinWavesPerEU);
1349+ Max = std::min (Max, MaxWavesPerEU);
1350+
1351+ // Update the attribute if it is not the max.
1352+ if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
1353+ SmallString<10 > Buffer;
1354+ raw_svector_ostream OS (Buffer);
1355+ OS << Min << ' ,' << Max;
1356+ Attribute OldAttr = F.getFnAttribute (" amdgpu-waves-per-eu" );
1357+ Attribute NewAttr = Attribute::get (Ctx, " amdgpu-waves-per-eu" , OS.str ());
1358+ F.addFnAttr (NewAttr);
1359+ Changed |= OldAttr == NewAttr;
1360+ }
1361+ }
1362+
1363+ return Changed;
1364+ }
1365+
13231366static bool runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
13241367 AMDGPUAttributorOptions Options) {
13251368 SetVector<Function *> Functions;
@@ -1394,8 +1437,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13941437 }
13951438 }
13961439
1397- ChangeStatus Change = A.run ();
1398- return Change == ChangeStatus::CHANGED;
1440+ bool Changed = A.run () == ChangeStatus::CHANGED;
1441+
1442+ Changed |= updateWavesPerEU (M, TM);
1443+
1444+ return Changed;
13991445}
14001446
14011447class AMDGPUAttributorLegacy : public ModulePass {
0 commit comments