@@ -1109,47 +1109,25 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
11091109    Function *F = getAssociatedFunction ();
11101110    auto  &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
11111111
1112-     auto  TakeRange = [&](std::pair<unsigned , unsigned > R) {
1113-       auto  [Min, Max] = R;
1114-       ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1115-       IntegerRangeState RangeState (Range);
1116-       clampStateAndIndicateChange (this ->getState (), RangeState);
1117-       indicateOptimisticFixpoint ();
1118-     };
1119- 
1120-     std::pair<unsigned , unsigned > MaxWavesPerEURange{
1121-         1U , InfoCache.getMaxWavesPerEU (*F)};
1122- 
11231112    //  If the attribute exists, we will honor it if it is not the default.
11241113    if  (auto  Attr = InfoCache.getWavesPerEUAttr (*F)) {
1114+       std::pair<unsigned , unsigned > MaxWavesPerEURange{
1115+           1U , InfoCache.getMaxWavesPerEU (*F)};
11251116      if  (*Attr != MaxWavesPerEURange) {
1126-         TakeRange (*Attr);
1117+         auto  [Min, Max] = *Attr;
1118+         ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1119+         IntegerRangeState RangeState (Range);
1120+         this ->getState () = RangeState;
1121+         indicateOptimisticFixpoint ();
11271122        return ;
11281123      }
11291124    }
11301125
1131-     //  Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the
1132-     //  calculation of waves per EU involves flat work group size, we can't
1133-     //  simply use an assumed flat work group size as a start point, because the
1134-     //  update of flat work group size is in an inverse direction of waves per
1135-     //  EU. However, we can still do something if it is an entry function. Since
1136-     //  an entry function is a terminal node, and flat work group size either
1137-     //  from attribute or default will be used anyway, we can take that value and
1138-     //  calculate the waves per EU based on it. This result can't be updated by
1139-     //  no means, but that could still allow us to propagate it.
1140-     if  (AMDGPU::isEntryFunctionCC (F->getCallingConv ())) {
1141-       std::pair<unsigned , unsigned > FlatWorkGroupSize;
1142-       if  (auto  Attr = InfoCache.getFlatWorkGroupSizeAttr (*F))
1143-         FlatWorkGroupSize = *Attr;
1144-       else 
1145-         FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize (*F);
1146-       TakeRange (InfoCache.getEffectiveWavesPerEU (*F, MaxWavesPerEURange,
1147-                                                  FlatWorkGroupSize));
1148-     }
1126+     if  (AMDGPU::isEntryFunctionCC (F->getCallingConv ()))
1127+       indicatePessimisticFixpoint ();
11491128  }
11501129
11511130  ChangeStatus updateImpl (Attributor &A) override  {
1152-     auto  &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
11531131    ChangeStatus Change = ChangeStatus::UNCHANGED;
11541132
11551133    auto  CheckCallSite = [&](AbstractCallSite CS) {
@@ -1158,24 +1136,21 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
11581136      LLVM_DEBUG (dbgs () << ' ['   << getName () << " ] Call "   << Caller->getName ()
11591137                        << " ->"   << Func->getName () << ' \n '  );
11601138
1161-       const  auto  *CallerInfo  = A.getAAFor <AAAMDWavesPerEU>(
1139+       const  auto  *CallerAA  = A.getAAFor <AAAMDWavesPerEU>(
11621140          *this , IRPosition::function (*Caller), DepClassTy::REQUIRED);
1163-       const  auto  *AssumedGroupSize = A.getAAFor <AAAMDFlatWorkGroupSize>(
1164-           *this , IRPosition::function (*Func), DepClassTy::REQUIRED);
1165-       if  (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState () ||
1166-           !AssumedGroupSize->isValidState ())
1141+       if  (!CallerAA || !CallerAA->isValidState ())
11671142        return  false ;
11681143
1169-       unsigned  Min, Max ;
1170-       std::tie  (Min, Max) = InfoCache. getEffectiveWavesPerEU ( 
1171-           *Caller, 
1172-           {CallerInfo-> getAssumed (). getLower ().getZExtValue (),
1173-            CallerInfo ->getAssumed ().getUpper ().getZExtValue () -  1 }, 
1174-           {AssumedGroupSize-> getAssumed (). getLower (). getZExtValue (), 
1175-            AssumedGroupSize-> getAssumed (). getUpper (). getZExtValue () -  1 } );
1176-       ConstantRange  CallerRange ( APInt ( 32 , Min),  APInt ( 32 , Max +  1 )) ;
1177-       IntegerRangeState  CallerRangeState (CallerRange); 
1178-       Change |=  clampStateAndIndicateChange ( this -> getState (), CallerRangeState) ;
1144+       auto  Assumed =  this -> getAssumed () ;
1145+       unsigned  Min =  std::max  (Assumed. getLower (). getZExtValue (), 
1146+                               CallerAA-> getAssumed (). getLower (). getZExtValue ()); 
1147+       unsigned  Max =  std::max (Assumed. getUpper ().getZExtValue (),
1148+                               CallerAA ->getAssumed ().getUpper ().getZExtValue ()); 
1149+       ConstantRange  Range ( APInt ( 32 , Min),  APInt ( 32 , Max)); 
1150+       IntegerRangeState  RangeState (Range );
1151+       this -> getState () = RangeState ;
1152+       Change |=  this -> getState () == Assumed ? ChangeStatus::UNCHANGED 
1153+                                             : ChangeStatus::CHANGED ;
11791154
11801155      return  true ;
11811156    };
@@ -1329,6 +1304,59 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
13291304  }
13301305}
13311306
1307+ static  void  checkWavesPerEU (Module &M, TargetMachine &TM) {
1308+   for  (Function &F : M) {
1309+     const  GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(F);
1310+ 
1311+     auto  FlatWgrpSizeAttr =
1312+         AMDGPU::getIntegerPairAttribute (F, " amdgpu-flat-work-group-size"  );
1313+     auto  WavesPerEUAttr = AMDGPU::getIntegerPairAttribute (
1314+         F, " amdgpu-waves-per-eu"  , /* OnlyFirstRequired=*/ true );
1315+ 
1316+     unsigned  MinWavesPerEU = ST.getMinWavesPerEU ();
1317+     unsigned  MaxWavesPerEU = ST.getMaxWavesPerEU ();
1318+ 
1319+     unsigned  MinFlatWgrpSize = 1U ;
1320+     unsigned  MaxFlatWgrpSize = 1024U ;
1321+     if  (FlatWgrpSizeAttr.has_value ()) {
1322+       MinFlatWgrpSize = FlatWgrpSizeAttr->first ;
1323+       MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second );
1324+     }
1325+ 
1326+     //  Start with the max range.
1327+     unsigned  Min = MinWavesPerEU;
1328+     unsigned  Max = MaxWavesPerEU;
1329+ 
1330+     //  If the attribute exists, set them to the value from the attribute.
1331+     if  (WavesPerEUAttr.has_value ()) {
1332+       Min = WavesPerEUAttr->first ;
1333+       if  (WavesPerEUAttr->second .has_value ())
1334+         Max = *(WavesPerEUAttr->second );
1335+     }
1336+ 
1337+     //  Compute the range from flat workgroup size.
1338+     auto  [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
1339+         ST.getWavesPerEU (F, std::make_pair (MinFlatWgrpSize, MaxFlatWgrpSize));
1340+ 
1341+     //  For the lower bound, we have to "tighten" it.
1342+     Min = std::max (Min, MinFromFlatWgrpSize);
1343+     //  For the upper bound, we have to "extend" it.
1344+     Max = std::max (Max, MaxFromFlatWgrpSize);
1345+ 
1346+     //  Clamp the range to the max range.
1347+     Min = std::max (Min, MinWavesPerEU);
1348+     Max = std::min (Max, MaxWavesPerEU);
1349+ 
1350+     //  Update the attribute if it is not the max.
1351+     if  (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
1352+       SmallString<10 > Buffer;
1353+       raw_svector_ostream OS (Buffer);
1354+       OS << Min << ' ,'   << Max;
1355+       F.addFnAttr (" amdgpu-waves-per-eu"  , OS.str ());
1356+     }
1357+   }
1358+ }
1359+ 
13321360static  bool  runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
13331361                    AMDGPUAttributorOptions Options,
13341362                    ThinOrFullLTOPhase LTOPhase) {
@@ -1418,8 +1446,14 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14181446    }
14191447  }
14201448
1421-   ChangeStatus Change = A.run ();
1422-   return  Change == ChangeStatus::CHANGED;
1449+   bool  Changed = A.run () == ChangeStatus::CHANGED;
1450+ 
1451+   if  (Changed && (LTOPhase == ThinOrFullLTOPhase::None ||
1452+                   LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
1453+                   LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink))
1454+     checkWavesPerEU (M, TM);
1455+ 
1456+   return  Changed;
14231457}
14241458
14251459class  AMDGPUAttributorLegacy  : public  ModulePass  {
0 commit comments