From fd3ae0b2bfa2aff67b2b1fc14b36cac185a9cafe Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 18 Oct 2024 23:05:51 +0400
Subject: [PATCH 1/7] AMDGPU: Propagate amdgpu-max-num-workgroups attribute

I'm not sure what the interpretation of 0 is supposed to be,
AMDGPUUsage doesn't say.
---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   | 154 +++++++++++-
 ...ttr-amdgpu-max-num-workgroups-propagate.ll | 228 ++++++++++++++++++
 2 files changed, 380 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 2ae34636005ea..d7a7db5b556dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -179,6 +179,11 @@ class AMDGPUInformationCache : public InformationCache {
     return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
   }
 
+  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    return ST.getMaxNumWorkGroups(F);
+  }
+
   /// Get code object version.
   unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
 
@@ -821,6 +826,150 @@ AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
       "AAAMDFlatWorkGroupSize is only valid for function position");
 }
 
+struct TupleDecIntegerRangeState : public AbstractState {
+  DecIntegerState<uint32_t> X, Y, Z;
+
+  bool isValidState() const override {
+    return X.isValidState() && Y.isValidState() && Z.isValidState();
+  }
+
+  bool isAtFixpoint() const override {
+    return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
+  }
+
+  ChangeStatus indicateOptimisticFixpoint() override {
+    return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
+           Z.indicateOptimisticFixpoint();
+  }
+
+  ChangeStatus indicatePessimisticFixpoint() override {
+    return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
+           Z.indicatePessimisticFixpoint();
+  }
+
+  TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
+    X ^= Other.X;
+    Y ^= Other.Y;
+    Z ^= Other.Z;
+    return *this;
+  }
+
+  bool operator==(const TupleDecIntegerRangeState &Other) const {
+    return X == Other.X && Y == Other.Y && Z == Other.Z;
+  }
+
+  TupleDecIntegerRangeState &getAssumed() { return *this; }
+  const TupleDecIntegerRangeState &getAssumed() const { return *this; }
+};
+
+using AAAMDMaxNumWorkgroupsState =
+    StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
+
+/// Propagate amdgpu-max-num-workgroups attribute.
+struct AAAMDMaxNumWorkgroups
+    : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
+  using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
+
+  AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  void initialize(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+
+    SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);
+
+    // FIXME: What is the interpretation of 0?
+    for (unsigned &Entry : MaxNumWorkgroups) {
+      if (Entry == 0)
+        Entry = std::numeric_limits<uint32_t>::max();
+    }
+
+    X.takeKnownMinimum(MaxNumWorkgroups[0]);
+    Y.takeKnownMinimum(MaxNumWorkgroups[1]);
+    Z.takeKnownMinimum(MaxNumWorkgroups[2]);
+
+    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
+      indicatePessimisticFixpoint();
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+    auto CheckCallSite = [&](AbstractCallSite CS) {
+      Function *Caller = CS.getInstruction()->getFunction();
+      LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
+                        << "->" << getAssociatedFunction()->getName() << '\n');
+
+      const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
+          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+      if (!CallerInfo)
+        return false;
+
+      Change |=
+          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
+      return true;
+    };
+
+    bool AllCallSitesKnown = true;
+    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
+      return indicatePessimisticFixpoint();
+
+    return Change;
+  }
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
+                                                  Attributor &A);
+
+  ChangeStatus manifest(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    // TODO: Skip adding if worst case?
+    LLVMContext &Ctx = F->getContext();
+    SmallString<32> Buffer;
+    raw_svector_ostream OS(Buffer);
+    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();
+
+    // TODO: Should annotate loads of the group size for this to do anything
+    // useful.
+    return A.manifestAttrs(
+        getIRPosition(),
+        {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
+        /* ForceReplace= */ true);
+  }
+
+  const std::string getName() const override { return "AAAMDMaxNumWorkgroups"; }
+
+  const std::string getAsStr(Attributor *) const override {
+    std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
+    raw_string_ostream OS(Buffer);
+    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
+       << ']';
+    return OS.str();
+  }
+
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAAMDMaxNumWorkgroups
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  void trackStatistics() const override {}
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+
+const char AAAMDMaxNumWorkgroups::ID = 0;
+
+AAAMDMaxNumWorkgroups &
+AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
+  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+    return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
+  llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
+}
+
 /// Propagate amdgpu-waves-per-eu attribute.
 struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
   AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
@@ -1046,8 +1195,8 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
   DenseSet<const char *> Allowed(
       {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
        &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
-       &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
-       &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
+       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
+       &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
        &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
        &AAInstanceInfo::ID});
 
@@ -1071,6 +1220,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
   for (auto *F : Functions) {
     A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
     A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
+    A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
     A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
     CallingConv::ID CC = F->getCallingConv();
     if (!AMDGPU::isEntryFunctionCC(CC)) {
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
new file mode 100644
index 0000000000000..bd1aa6e6ed470
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
+
+; External call to avoid inferring argument attributes. This makes the
+; final attribute groups easier to read
+declare void @dummy()
+
+define void @extern_callee() {
+; CHECK-LABEL: define void @extern_callee(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+define internal void @callee_1_2_3() {
+; CHECK-LABEL: define internal void @callee_1_2_3(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_1_2_3() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_1_2_3(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:    call void @callee_1_2_3()
+; CHECK-NEXT:    call void @extern_callee()
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @callee_1_2_3()
+  call void @extern_callee()
+  call void @dummy()
+  ret void
+}
+
+attributes #0 = {"amdgpu-max-num-workgroups"="1,2,3"}
+
+; -> 100,10,99
+define internal void @callee_merge_100_8_32__16_10_99() {
+; CHECK-LABEL: define internal void @callee_merge_100_8_32__16_10_99(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_100_8_32() #1 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_100_8_32(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:    call void @callee_merge_100_8_32__16_10_99()
+; CHECK-NEXT:    ret void
+;
+  call void @callee_merge_100_8_32__16_10_99()
+  ret void
+}
+
+attributes #1 = {"amdgpu-max-num-workgroups"="100,8,32"}
+
+define amdgpu_kernel void @kernel_16_10_99() #2 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_16_10_99(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    call void @callee_merge_100_8_32__16_10_99()
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @callee_merge_100_8_32__16_10_99()
+  call void @dummy()
+  ret void
+}
+
+attributes #2 = {"amdgpu-max-num-workgroups"="16,10,99"}
+
+define internal void @merge_to_worst_case() {
+; CHECK-LABEL: define internal void @merge_to_worst_case(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+define internal void @callee_x_worst_case() {
+; CHECK-LABEL: define internal void @callee_x_worst_case(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_x_maximum() #3 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_x_maximum(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT:    call void @merge_to_worst_case()
+; CHECK-NEXT:    call void @callee_x_worst_case()
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_to_worst_case()
+  call void @callee_x_worst_case()
+  call void @dummy()
+  ret void
+}
+
+attributes #3 = {"amdgpu-max-num-workgroups"="4294967295,1,1"}
+
+define amdgpu_kernel void @kernel_y_maximum() #4 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_y_maximum(
+; CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT:    call void @merge_to_worst_case()
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_to_worst_case()
+  call void @dummy()
+  ret void
+}
+
+attributes #4 = {"amdgpu-max-num-workgroups"="1,4294967295,1"}
+
+define amdgpu_kernel void @kernel_z_maximum() #5 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_z_maximum(
+; CHECK-SAME: ) #[[ATTR7:[0-9]+]] {
+; CHECK-NEXT:    call void @merge_to_worst_case()
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_to_worst_case()
+  call void @dummy()
+  ret void
+}
+
+attributes #5 = {"amdgpu-max-num-workgroups"="1,1,4294967295"}
+
+; Make sure the attribute isn't lost from the callee.
+define internal void @annotated_callee_from_unannotated_kernel() #6 {
+; CHECK-LABEL: define internal void @annotated_callee_from_unannotated_kernel(
+; CHECK-SAME: ) #[[ATTR8:[0-9]+]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+attributes #6 = {"amdgpu-max-num-workgroups"="42,99,123"}
+
+define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee()  {
+; CHECK-LABEL: define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @annotated_callee_from_unannotated_kernel()
+; CHECK-NEXT:    ret void
+;
+  call void @annotated_callee_from_unannotated_kernel()
+  ret void
+}
+
+
+define internal void @annotated_callee_merge_caller() #7 {
+; CHECK-LABEL: define internal void @annotated_callee_merge_caller(
+; CHECK-SAME: ) #[[ATTR9:[0-9]+]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+attributes #7 = {"amdgpu-max-num-workgroups"="512,256,1024"}
+
+define amdgpu_kernel void @call_annotated_callee_merge_caller() #8 {
+; CHECK-LABEL: define amdgpu_kernel void @call_annotated_callee_merge_caller(
+; CHECK-SAME: ) #[[ATTR10:[0-9]+]] {
+; CHECK-NEXT:    call void @annotated_callee_merge_caller()
+; CHECK-NEXT:    ret void
+;
+  call void @annotated_callee_merge_caller()
+  ret void
+}
+
+attributes #8 = {"amdgpu-max-num-workgroups"="256,128,2048"}
+
+define internal void @called_by_explicit_worst_case() {
+; CHECK-LABEL: define internal void @called_by_explicit_worst_case(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_explicit_worst_case() #9 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_explicit_worst_case(
+; CHECK-SAME: ) #[[ATTR11:[0-9]+]] {
+; CHECK-NEXT:    call void @called_by_explicit_worst_case()
+; CHECK-NEXT:    ret void
+;
+  call void @called_by_explicit_worst_case()
+  ret void
+}
+
+attributes #9 = {"amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295"}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="1,2,3" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="100,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="100,8,32" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-max-num-workgroups"="16,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-max-num-workgroups"="4294967295,1,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-max-num-workgroups"="1,4294967295,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-max-num-workgroups"="1,1,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-max-num-workgroups"="42,99,123" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-max-num-workgroups"="256,128,1024" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-max-num-workgroups"="256,128,2048" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR11]] = { "amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+;.

From 41bb72b02a2983e54fb4f0fa6a6625ed65f27d7a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 21 Oct 2024 16:59:28 -0700
Subject: [PATCH 2/7] Comment parameter name

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index d7a7db5b556dd..8c762ff9d1ce0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -911,7 +911,9 @@ struct AAAMDMaxNumWorkgroups
     };
 
     bool AllCallSitesKnown = true;
-    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
+    if (!A.checkForAllCallSites(CheckCallSite, *this,
+                                /*RequireAllCallSites=*/true,
+                                AllCallSitesKnown))
       return indicatePessimisticFixpoint();
 
     return Change;

From dfe5c82636592a086e63afaed71afb801d42fec8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 21 Oct 2024 17:03:32 -0700
Subject: [PATCH 3/7] Add a shader entry test

---
 .../AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
index bd1aa6e6ed470..366432e0fc6cb 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
@@ -62,6 +62,16 @@ define amdgpu_kernel void @kernel_100_8_32() #1 {
   ret void
 }
 
+define amdgpu_cs void @amdgpu_cs_100_8_32() #1 {
+; CHECK-LABEL: define amdgpu_cs void @amdgpu_cs_100_8_32(
+; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-NEXT:    call void @callee_merge_100_8_32__16_10_99()
+; CHECK-NEXT:    ret void
+;
+  call void @callee_merge_100_8_32__16_10_99()
+  ret void
+}
+
 attributes #1 = {"amdgpu-max-num-workgroups"="100,8,32"}
 
 define amdgpu_kernel void @kernel_16_10_99() #2 {

From 7994c830fc7077bb3f8f8033a8b98b131e5c86c7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 21 Oct 2024 17:08:05 -0700
Subject: [PATCH 4/7] Check isValidState

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 8c762ff9d1ce0..78e1c8c5b82cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -902,7 +902,7 @@ struct AAAMDMaxNumWorkgroups
 
       const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
           *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
-      if (!CallerInfo)
+      if (!CallerInfo || !CallerInfo->isValidState())
         return false;
 
       Change |=

From eaf4a265e1bc8d4a77fd47a1e9754013cd24df83 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 5 Nov 2024 13:09:45 -0800
Subject: [PATCH 5/7] Remove fixup of 0

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 78e1c8c5b82cd..439ce53fd1fdf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -878,12 +878,6 @@ struct AAAMDMaxNumWorkgroups
 
     SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);
 
-    // FIXME: What is the interpretation of 0?
-    for (unsigned &Entry : MaxNumWorkgroups) {
-      if (Entry == 0)
-        Entry = std::numeric_limits<uint32_t>::max();
-    }
-
     X.takeKnownMinimum(MaxNumWorkgroups[0]);
     Y.takeKnownMinimum(MaxNumWorkgroups[1]);
     Z.takeKnownMinimum(MaxNumWorkgroups[2]);

From 2ad2f35ec4a3748d1567ebcd3050aa5657d61157 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 6 Nov 2024 13:57:58 -0800
Subject: [PATCH 6/7] Remove todo

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 439ce53fd1fdf..b6e968ef82c96 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -919,7 +919,6 @@ struct AAAMDMaxNumWorkgroups
 
   ChangeStatus manifest(Attributor &A) override {
     Function *F = getAssociatedFunction();
-    // TODO: Skip adding if worst case?
     LLVMContext &Ctx = F->getContext();
     SmallString<32> Buffer;
     raw_svector_ostream OS(Buffer);

From cc4a77290bc498c22cf5b848c39e4effc8103ba5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 19 Oct 2024 02:18:45 +0400
Subject: [PATCH 7/7] AMDGPU: Mark grid size loads with range metadata

Only handles the v5 case.
---
 .../AMDGPU/AMDGPULowerKernelAttributes.cpp    |  33 ++++-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |   1 +
 ...amdgpu-max-num-workgroups-load-annotate.ll | 124 ++++++++++++++++++
 3 files changed, 154 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 1bb5e794da7dd..5fc0c36359b6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 
@@ -83,6 +84,20 @@ Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
 
 } // end anonymous namespace
 
+static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
+                                            uint32_t MaxNumGroups) {
+  if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
+    return;
+
+  if (!Load->getType()->isIntegerTy(32))
+    return;
+
+  // TODO: If there is existing range metadata, preserve it if it is stricter.
+  MDBuilder MDB(Load->getContext());
+  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
+  Load->setMetadata(LLVMContext::MD_range, Range);
+}
+
 static bool processUse(CallInst *CI, bool IsV5OrAbove) {
   Function *F = CI->getParent()->getParent();
 
@@ -92,7 +107,11 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
   const bool HasUniformWorkGroupSize =
     F->getFnAttribute("uniform-work-group-size").getValueAsBool();
 
-  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
+  SmallVector<unsigned> MaxNumWorkgroups =
+      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", 3);
+
+  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
+      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
     return false;
 
   Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
@@ -133,16 +152,22 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
     if (IsV5OrAbove) { // Base is ImplicitArgPtr.
       switch (Offset) {
       case HIDDEN_BLOCK_COUNT_X:
-        if (LoadSize == 4)
+        if (LoadSize == 4) {
           BlockCounts[0] = Load;
+          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
+        }
         break;
       case HIDDEN_BLOCK_COUNT_Y:
-        if (LoadSize == 4)
+        if (LoadSize == 4) {
           BlockCounts[1] = Load;
+          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
+        }
         break;
       case HIDDEN_BLOCK_COUNT_Z:
-        if (LoadSize == 4)
+        if (LoadSize == 4) {
           BlockCounts[2] = Load;
+          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
+        }
         break;
       case HIDDEN_GROUP_SIZE_X:
         if (LoadSize == 2)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 54b17ca2cffb1..b18ce90cf45db 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -369,6 +369,7 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct
       TM.getSubtarget<R600Subtarget>(F));
 }
 
+// FIXME: This has no reason to be in subtarget
 SmallVector<unsigned>
 AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
   return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
new file mode 100644
index 0000000000000..9064292129928
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s
+
+define i32 @use_grid_size_x_max_num_workgroups() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_X]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+  ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0]]
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_X]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4, !range !0
+  ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_y_max_num_workgroups() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_y_max_num_workgroups(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GEP_GRID_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 4
+; CHECK-NEXT:    [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG1:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_Y]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.grid.size.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 4
+  %grid.size.y = load i32, ptr addrspace(4) %gep.grid.size.y, align 4
+  ret i32 %grid.size.y
+}
+
+define i32 @use_grid_size_z_max_num_workgroups() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_z_max_num_workgroups(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GEP_GRID_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 8
+; CHECK-NEXT:    [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG2:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_Z]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.grid.size.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 8
+  %grid.size.z = load i32, ptr addrspace(4) %gep.grid.size.z, align 4
+  ret i32 %grid.size.z
+}
+
+define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type() #0 {
+; CHECK-LABEL: define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load <2 x i16>, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
+; CHECK-NEXT:    ret <2 x i16> [[GRID_SIZE_X]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %grid.size.x = load <2 x i16>, ptr addrspace(4) %implicitarg.ptr, align 4
+  ret <2 x i16> %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_max_minus_1() #1 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max_minus_1(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG3:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_X]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+  ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_max() #2 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_X]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+  ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_zero() #3 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_zero(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_X]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+  ret i32 %grid.size.x
+}
+
+declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3
+
+attributes #0 = { "amdgpu-max-num-workgroups"="36,42,89" }
+attributes #1 = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
+attributes #2 = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
+attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" }
+
+!0 = !{i32 0, i32 -1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[RNG0]] = !{i32 1, i32 37}
+; CHECK: [[RNG1]] = !{i32 1, i32 43}
+; CHECK: [[RNG2]] = !{i32 1, i32 90}
+; CHECK: [[RNG3]] = !{i32 1, i32 -1}
+;.