@@ -1109,47 +1109,25 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
11091109 Function *F = getAssociatedFunction ();
11101110 auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
11111111
1112- auto TakeRange = [&](std::pair<unsigned , unsigned > R) {
1113- auto [Min, Max] = R;
1114- ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1115- IntegerRangeState RangeState (Range);
1116- clampStateAndIndicateChange (this ->getState (), RangeState);
1117- indicateOptimisticFixpoint ();
1118- };
1119-
1120- std::pair<unsigned , unsigned > MaxWavesPerEURange{
1121- 1U , InfoCache.getMaxWavesPerEU (*F)};
1122-
11231112 // If the attribute exists, we will honor it if it is not the default.
11241113 if (auto Attr = InfoCache.getWavesPerEUAttr (*F)) {
1114+ std::pair<unsigned , unsigned > MaxWavesPerEURange{
1115+ 1U , InfoCache.getMaxWavesPerEU (*F)};
11251116 if (*Attr != MaxWavesPerEURange) {
1126- TakeRange (*Attr);
1117+ auto [Min, Max] = *Attr;
1118+ ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1119+ IntegerRangeState RangeState (Range);
1120+ this ->getState () = RangeState;
1121+ indicateOptimisticFixpoint ();
11271122 return ;
11281123 }
11291124 }
11301125
1131- // Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the
1132- // calculation of waves per EU involves flat work group size, we can't
1133- // simply use an assumed flat work group size as a start point, because the
1134- // update of flat work group size is in an inverse direction of waves per
1135- // EU. However, we can still do something if it is an entry function. Since
1136- // an entry function is a terminal node, and flat work group size either
1137- // from attribute or default will be used anyway, we can take that value and
1138- // calculate the waves per EU based on it. This result can't be updated by
1139- // no means, but that could still allow us to propagate it.
1140- if (AMDGPU::isEntryFunctionCC (F->getCallingConv ())) {
1141- std::pair<unsigned , unsigned > FlatWorkGroupSize;
1142- if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr (*F))
1143- FlatWorkGroupSize = *Attr;
1144- else
1145- FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize (*F);
1146- TakeRange (InfoCache.getEffectiveWavesPerEU (*F, MaxWavesPerEURange,
1147- FlatWorkGroupSize));
1148- }
1126+ if (AMDGPU::isEntryFunctionCC (F->getCallingConv ()))
1127+ indicatePessimisticFixpoint ();
11491128 }
11501129
11511130 ChangeStatus updateImpl (Attributor &A) override {
1152- auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
11531131 ChangeStatus Change = ChangeStatus::UNCHANGED;
11541132
11551133 auto CheckCallSite = [&](AbstractCallSite CS) {
@@ -1158,24 +1136,21 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
11581136 LLVM_DEBUG (dbgs () << ' [' << getName () << " ] Call " << Caller->getName ()
11591137 << " ->" << Func->getName () << ' \n ' );
11601138
1161- const auto *CallerInfo = A.getAAFor <AAAMDWavesPerEU>(
1139+ const auto *CallerAA = A.getAAFor <AAAMDWavesPerEU>(
11621140 *this , IRPosition::function (*Caller), DepClassTy::REQUIRED);
1163- const auto *AssumedGroupSize = A.getAAFor <AAAMDFlatWorkGroupSize>(
1164- *this , IRPosition::function (*Func), DepClassTy::REQUIRED);
1165- if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState () ||
1166- !AssumedGroupSize->isValidState ())
1141+ if (!CallerAA || !CallerAA->isValidState ())
11671142 return false ;
11681143
1169- unsigned Min, Max ;
1170- std::tie (Min, Max) = InfoCache. getEffectiveWavesPerEU (
1171- *Caller,
1172- {CallerInfo-> getAssumed (). getLower ().getZExtValue (),
1173- CallerInfo ->getAssumed ().getUpper ().getZExtValue () - 1 },
1174- {AssumedGroupSize-> getAssumed (). getLower (). getZExtValue (),
1175- AssumedGroupSize-> getAssumed (). getUpper (). getZExtValue () - 1 } );
1176- ConstantRange CallerRange ( APInt ( 32 , Min), APInt ( 32 , Max + 1 )) ;
1177- IntegerRangeState CallerRangeState (CallerRange);
1178- Change |= clampStateAndIndicateChange ( this -> getState (), CallerRangeState) ;
1144+ auto Assumed = this -> getAssumed () ;
1145+ unsigned Min = std::max (Assumed. getLower (). getZExtValue (),
1146+ CallerAA-> getAssumed (). getLower (). getZExtValue ());
1147+ unsigned Max = std::max (Assumed. getUpper ().getZExtValue (),
1148+ CallerAA ->getAssumed ().getUpper ().getZExtValue ());
1149+ ConstantRange Range ( APInt ( 32 , Min), APInt ( 32 , Max));
1150+ IntegerRangeState RangeState (Range );
1151+ this -> getState () = RangeState ;
1152+ Change |= this -> getState () == Assumed ? ChangeStatus::UNCHANGED
1153+ : ChangeStatus::CHANGED ;
11791154
11801155 return true ;
11811156 };
@@ -1329,6 +1304,59 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
13291304 }
13301305}
13311306
1307+ static void checkWavesPerEU (Module &M, TargetMachine &TM) {
1308+ for (Function &F : M) {
1309+ const GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(F);
1310+
1311+ auto FlatWgrpSizeAttr =
1312+ AMDGPU::getIntegerPairAttribute (F, " amdgpu-flat-work-group-size" );
1313+ auto WavesPerEUAttr = AMDGPU::getIntegerPairAttribute (
1314+ F, " amdgpu-waves-per-eu" , /* OnlyFirstRequired=*/ true );
1315+
1316+ unsigned MinWavesPerEU = ST.getMinWavesPerEU ();
1317+ unsigned MaxWavesPerEU = ST.getMaxWavesPerEU ();
1318+
1319+ unsigned MinFlatWgrpSize = 1U ;
1320+ unsigned MaxFlatWgrpSize = 1024U ;
1321+ if (FlatWgrpSizeAttr.has_value ()) {
1322+ MinFlatWgrpSize = FlatWgrpSizeAttr->first ;
1323+ MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second );
1324+ }
1325+
1326+ // Start with the max range.
1327+ unsigned Min = MinWavesPerEU;
1328+ unsigned Max = MaxWavesPerEU;
1329+
1330+ // If the attribute exists, set them to the value from the attribute.
1331+ if (WavesPerEUAttr.has_value ()) {
1332+ Min = WavesPerEUAttr->first ;
1333+ if (WavesPerEUAttr->second .has_value ())
1334+ Max = *(WavesPerEUAttr->second );
1335+ }
1336+
1337+ // Compute the range from flat workgroup size.
1338+ auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
1339+ ST.getWavesPerEU (F, std::make_pair (MinFlatWgrpSize, MaxFlatWgrpSize));
1340+
1341+ // For the lower bound, we have to "tighten" it.
1342+ Min = std::max (Min, MinFromFlatWgrpSize);
1343+ // For the upper bound, we have to "extend" it.
1344+ Max = std::max (Max, MaxFromFlatWgrpSize);
1345+
1346+ // Clamp the range to the max range.
1347+ Min = std::max (Min, MinWavesPerEU);
1348+ Max = std::min (Max, MaxWavesPerEU);
1349+
1350+ // Update the attribute if it is not the max.
1351+ if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
1352+ SmallString<10 > Buffer;
1353+ raw_svector_ostream OS (Buffer);
1354+ OS << Min << ' ,' << Max;
1355+ F.addFnAttr (" amdgpu-waves-per-eu" , OS.str ());
1356+ }
1357+ }
1358+ }
1359+
13321360static bool runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
13331361 AMDGPUAttributorOptions Options,
13341362 ThinOrFullLTOPhase LTOPhase) {
@@ -1418,8 +1446,14 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14181446 }
14191447 }
14201448
1421- ChangeStatus Change = A.run ();
1422- return Change == ChangeStatus::CHANGED;
1449+ bool Changed = A.run () == ChangeStatus::CHANGED;
1450+
1451+ if (Changed && (LTOPhase == ThinOrFullLTOPhase::None ||
1452+ LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
1453+ LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink))
1454+ checkWavesPerEU (M, TM);
1455+
1456+ return Changed;
14231457}
14241458
14251459class AMDGPUAttributorLegacy : public ModulePass {
0 commit comments