llvm
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp‎
Lines changed: 82 additions & 48 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp‎
Lines changed: 82 additions & 48 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll‎
Lines changed: 3 additions & 3 deletions b/‎llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll‎
Lines changed: 6 additions & 6 deletions b/‎llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll‎
Lines changed: 10 additions & 10 deletions b/‎llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll‎
Lines changed: 10 additions & 10 deletions
@@ -1109,47 +1109,25 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
     Function *F = getAssociatedFunction();
     auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
 
-    auto TakeRange = [&](std::pair<unsigned, unsigned> R) {
-      auto [Min, Max] = R;
-      ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
-      IntegerRangeState RangeState(Range);
-      clampStateAndIndicateChange(this->getState(), RangeState);
-      indicateOptimisticFixpoint();
-    };
-
-    std::pair<unsigned, unsigned> MaxWavesPerEURange{
-        1U, InfoCache.getMaxWavesPerEU(*F)};
-
     // If the attribute exists, we will honor it if it is not the default.
     if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
+      std::pair<unsigned, unsigned> MaxWavesPerEURange{
+          1U, InfoCache.getMaxWavesPerEU(*F)};
       if (*Attr != MaxWavesPerEURange) {
-        TakeRange(*Attr);
+        auto [Min, Max] = *Attr;
+        ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
+        IntegerRangeState RangeState(Range);
+        this->getState() = RangeState;
+        indicateOptimisticFixpoint();
         return;
       }
     }
 
-    // Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the
-    // calculation of waves per EU involves flat work group size, we can't
-    // simply use an assumed flat work group size as a start point, because the
-    // update of flat work group size is in an inverse direction of waves per
-    // EU. However, we can still do something if it is an entry function. Since
-    // an entry function is a terminal node, and flat work group size either
-    // from attribute or default will be used anyway, we can take that value and
-    // calculate the waves per EU based on it. This result can't be updated by
-    // no means, but that could still allow us to propagate it.
-    if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
-      std::pair<unsigned, unsigned> FlatWorkGroupSize;
-      if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F))
-        FlatWorkGroupSize = *Attr;
-      else
-        FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F);
-      TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange,
-                                                 FlatWorkGroupSize));
-    }
+    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
+      indicatePessimisticFixpoint();
   }
 
   ChangeStatus updateImpl(Attributor &A) override {
-    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
     ChangeStatus Change = ChangeStatus::UNCHANGED;
 
     auto CheckCallSite = [&](AbstractCallSite CS) {
@@ -1158,24 +1136,21 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
       LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                         << "->" << Func->getName() << '\n');
 
-      const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
+      const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
           *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
-      const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
-          *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
-      if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState() ||
-          !AssumedGroupSize->isValidState())
+      if (!CallerAA || !CallerAA->isValidState())
         return false;
 
-      unsigned Min, Max;
-      std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
-          *Caller,
-          {CallerInfo->getAssumed().getLower().getZExtValue(),
-           CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
-          {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
-           AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
-      ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
-      IntegerRangeState CallerRangeState(CallerRange);
-      Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);
+      auto Assumed = this->getAssumed();
+      unsigned Min = std::max(Assumed.getLower().getZExtValue(),
+                              CallerAA->getAssumed().getLower().getZExtValue());
+      unsigned Max = std::max(Assumed.getUpper().getZExtValue(),
+                              CallerAA->getAssumed().getUpper().getZExtValue());
+      ConstantRange Range(APInt(32, Min), APInt(32, Max));
+      IntegerRangeState RangeState(Range);
+      this->getState() = RangeState;
+      Change |= this->getState() == Assumed ? ChangeStatus::UNCHANGED
+                                            : ChangeStatus::CHANGED;
 
       return true;
     };
@@ -1329,6 +1304,59 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
   }
 }
 
+static void checkWavesPerEU(Module &M, TargetMachine &TM) {
+  for (Function &F : M) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+
+    auto FlatWgrpSizeAttr =
+        AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
+    auto WavesPerEUAttr = AMDGPU::getIntegerPairAttribute(
+        F, "amdgpu-waves-per-eu", /*OnlyFirstRequired=*/true);
+
+    unsigned MinWavesPerEU = ST.getMinWavesPerEU();
+    unsigned MaxWavesPerEU = ST.getMaxWavesPerEU();
+
+    unsigned MinFlatWgrpSize = 1U;
+    unsigned MaxFlatWgrpSize = 1024U;
+    if (FlatWgrpSizeAttr.has_value()) {
+      MinFlatWgrpSize = FlatWgrpSizeAttr->first;
+      MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second);
+    }
+
+    // Start with the max range.
+    unsigned Min = MinWavesPerEU;
+    unsigned Max = MaxWavesPerEU;
+
+    // If the attribute exists, set them to the value from the attribute.
+    if (WavesPerEUAttr.has_value()) {
+      Min = WavesPerEUAttr->first;
+      if (WavesPerEUAttr->second.has_value())
+        Max = *(WavesPerEUAttr->second);
+    }
+
+    // Compute the range from flat workgroup size.
+    auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
+        ST.getWavesPerEU(F, std::make_pair(MinFlatWgrpSize, MaxFlatWgrpSize));
+
+    // For the lower bound, we have to "tighten" it.
+    Min = std::max(Min, MinFromFlatWgrpSize);
+    // For the upper bound, we have to "extend" it.
+    Max = std::max(Max, MaxFromFlatWgrpSize);
+
+    // Clamp the range to the max range.
+    Min = std::max(Min, MinWavesPerEU);
+    Max = std::min(Max, MaxWavesPerEU);
+
+    // Update the attribute if it is not the max.
+    if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
+      SmallString<10> Buffer;
+      raw_svector_ostream OS(Buffer);
+      OS << Min << ',' << Max;
+      F.addFnAttr("amdgpu-waves-per-eu", OS.str());
+    }
+  }
+}
+
 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                     AMDGPUAttributorOptions Options,
                     ThinOrFullLTOPhase LTOPhase) {
@@ -1418,8 +1446,14 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     }
   }
 
-  ChangeStatus Change = A.run();
-  return Change == ChangeStatus::CHANGED;
+  bool Changed = A.run() == ChangeStatus::CHANGED;
+
+  if (Changed && (LTOPhase == ThinOrFullLTOPhase::None ||
+                  LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
+                  LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink))
+    checkWavesPerEU(M, TM);
+
+  return Changed;
 }
 
 class AMDGPUAttributorLegacy : public ModulePass {
 
@@ -232,9 +232,9 @@ attributes #1 = { nounwind }
 ; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
 ; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
 ;.
-; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "amdgpu-waves-per-eu"="4,10" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
 
@@ -254,11 +254,11 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 
 attributes #0 = { "amdgpu-no-agpr" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" }
 ; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-agpr" }
 ;.
@@ -117,14 +117,14 @@ define void @call_no_dispatch_id() {
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.