@@ -1108,47 +1108,25 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
11081108 Function *F = getAssociatedFunction ();
11091109 auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
11101110
1111- auto TakeRange = [&](std::pair<unsigned , unsigned > R) {
1112- auto [Min, Max] = R;
1113- ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1114- IntegerRangeState RangeState (Range);
1115- clampStateAndIndicateChange (this ->getState (), RangeState);
1116- indicateOptimisticFixpoint ();
1117- };
1118-
1119- std::pair<unsigned , unsigned > MaxWavesPerEURange{
1120- 1U , InfoCache.getMaxWavesPerEU (*F)};
1121-
11221111 // If the attribute exists, we will honor it if it is not the default.
11231112 if (auto Attr = InfoCache.getWavesPerEUAttr (*F)) {
1113+ std::pair<unsigned , unsigned > MaxWavesPerEURange{
1114+ 1U , InfoCache.getMaxWavesPerEU (*F)};
11241115 if (*Attr != MaxWavesPerEURange) {
1125- TakeRange (*Attr);
1116+ auto [Min, Max] = *Attr;
1117+ ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1118+ IntegerRangeState RangeState (Range);
1119+ this ->getState () = RangeState;
1120+ indicateOptimisticFixpoint ();
11261121 return ;
11271122 }
11281123 }
11291124
1130- // Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the
1131- // calculation of waves per EU involves flat work group size, we can't
1132- // simply use an assumed flat work group size as a start point, because the
1133- // update of flat work group size is in an inverse direction of waves per
1134- // EU. However, we can still do something if it is an entry function. Since
1135- // an entry function is a terminal node, and flat work group size either
1136- // from attribute or default will be used anyway, we can take that value and
1137- // calculate the waves per EU based on it. This result can't be updated by
1138- // no means, but that could still allow us to propagate it.
1139- if (AMDGPU::isEntryFunctionCC (F->getCallingConv ())) {
1140- std::pair<unsigned , unsigned > FlatWorkGroupSize;
1141- if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr (*F))
1142- FlatWorkGroupSize = *Attr;
1143- else
1144- FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize (*F);
1145- TakeRange (InfoCache.getEffectiveWavesPerEU (*F, MaxWavesPerEURange,
1146- FlatWorkGroupSize));
1147- }
1125+ if (AMDGPU::isEntryFunctionCC (F->getCallingConv ()))
1126+ indicatePessimisticFixpoint ();
11481127 }
11491128
11501129 ChangeStatus updateImpl (Attributor &A) override {
1151- auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
11521130 ChangeStatus Change = ChangeStatus::UNCHANGED;
11531131
11541132 auto CheckCallSite = [&](AbstractCallSite CS) {
@@ -1157,24 +1135,21 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
11571135 LLVM_DEBUG (dbgs () << ' [' << getName () << " ] Call " << Caller->getName ()
11581136 << " ->" << Func->getName () << ' \n ' );
11591137
1160- const auto *CallerInfo = A.getAAFor <AAAMDWavesPerEU>(
1138+ const auto *CallerAA = A.getAAFor <AAAMDWavesPerEU>(
11611139 *this , IRPosition::function (*Caller), DepClassTy::REQUIRED);
1162- const auto *AssumedGroupSize = A.getAAFor <AAAMDFlatWorkGroupSize>(
1163- *this , IRPosition::function (*Func), DepClassTy::REQUIRED);
1164- if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState () ||
1165- !AssumedGroupSize->isValidState ())
1140+ if (!CallerAA || !CallerAA->isValidState ())
11661141 return false ;
11671142
1168- unsigned Min, Max ;
1169- std::tie (Min, Max) = InfoCache. getEffectiveWavesPerEU (
1170- *Caller,
1171- {CallerInfo-> getAssumed (). getLower ().getZExtValue (),
1172- CallerInfo ->getAssumed ().getUpper ().getZExtValue () - 1 },
1173- {AssumedGroupSize-> getAssumed (). getLower (). getZExtValue (),
1174- AssumedGroupSize-> getAssumed (). getUpper (). getZExtValue () - 1 } );
1175- ConstantRange CallerRange ( APInt ( 32 , Min), APInt ( 32 , Max + 1 )) ;
1176- IntegerRangeState CallerRangeState (CallerRange);
1177- Change |= clampStateAndIndicateChange ( this -> getState (), CallerRangeState) ;
1143+ auto Assumed = this -> getAssumed () ;
1144+ unsigned Min = std::max (Assumed. getLower (). getZExtValue (),
1145+ CallerAA-> getAssumed (). getLower (). getZExtValue ());
1146+ unsigned Max = std::max (Assumed. getUpper ().getZExtValue (),
1147+ CallerAA ->getAssumed ().getUpper ().getZExtValue ());
1148+ ConstantRange Range ( APInt ( 32 , Min), APInt ( 32 , Max));
1149+ IntegerRangeState RangeState (Range );
1150+ this -> getState () = RangeState ;
1151+ Change |= this -> getState () == Assumed ? ChangeStatus::UNCHANGED
1152+ : ChangeStatus::CHANGED ;
11781153
11791154 return true ;
11801155 };
@@ -1333,6 +1308,59 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
13331308 }
13341309}
13351310
1311+ static void checkWavesPerEU (Module &M, TargetMachine &TM) {
1312+ for (Function &F : M) {
1313+ const GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(F);
1314+
1315+ auto FlatWgrpSizeAttr =
1316+ AMDGPU::getIntegerPairAttribute (F, " amdgpu-flat-work-group-size" );
1317+ auto WavesPerEUAttr = AMDGPU::getIntegerPairAttribute (
1318+ F, " amdgpu-waves-per-eu" , /* OnlyFirstRequired=*/ true );
1319+
1320+ unsigned MinWavesPerEU = ST.getMinWavesPerEU ();
1321+ unsigned MaxWavesPerEU = ST.getMaxWavesPerEU ();
1322+
1323+ unsigned MinFlatWgrpSize = 1U ;
1324+ unsigned MaxFlatWgrpSize = 1024U ;
1325+ if (FlatWgrpSizeAttr.has_value ()) {
1326+ MinFlatWgrpSize = FlatWgrpSizeAttr->first ;
1327+ MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second );
1328+ }
1329+
1330+ // Start with the max range.
1331+ unsigned Min = MinWavesPerEU;
1332+ unsigned Max = MaxWavesPerEU;
1333+
1334+ // If the attribute exists, set them to the value from the attribute.
1335+ if (WavesPerEUAttr.has_value ()) {
1336+ Min = WavesPerEUAttr->first ;
1337+ if (WavesPerEUAttr->second .has_value ())
1338+ Max = *(WavesPerEUAttr->second );
1339+ }
1340+
1341+ // Compute the range from flat workgroup size.
1342+ auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
1343+ ST.getWavesPerEU (F, std::make_pair (MinFlatWgrpSize, MaxFlatWgrpSize));
1344+
1345+ // For the lower bound, we have to "tighten" it.
1346+ Min = std::max (Min, MinFromFlatWgrpSize);
1347+ // For the upper bound, we have to "extend" it.
1348+ Max = std::max (Max, MaxFromFlatWgrpSize);
1349+
1350+ // Clamp the range to the max range.
1351+ Min = std::max (Min, MinWavesPerEU);
1352+ Max = std::min (Max, MaxWavesPerEU);
1353+
1354+ // Update the attribute if it is not the max.
1355+ if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
1356+ SmallString<10 > Buffer;
1357+ raw_svector_ostream OS (Buffer);
1358+ OS << Min << ' ,' << Max;
1359+ F.addFnAttr (" amdgpu-waves-per-eu" , OS.str ());
1360+ }
1361+ }
1362+ }
1363+
13361364static bool runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
13371365 AMDGPUAttributorOptions Options,
13381366 ThinOrFullLTOPhase LTOPhase) {
@@ -1408,8 +1436,14 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14081436 }
14091437 }
14101438
1411- ChangeStatus Change = A.run ();
1412- return Change == ChangeStatus::CHANGED;
1439+ bool Changed = A.run () == ChangeStatus::CHANGED;
1440+
1441+ if (Changed && (LTOPhase == ThinOrFullLTOPhase::None ||
1442+ LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
1443+ LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink))
1444+ checkWavesPerEU (M, TM);
1445+
1446+ return Changed;
14131447}
14141448
14151449class AMDGPUAttributorLegacy : public ModulePass {
0 commit comments