From fd3ae0b2bfa2aff67b2b1fc14b36cac185a9cafe Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 18 Oct 2024 23:05:51 +0400 Subject: [PATCH 1/7] AMDGPU: Propagate amdgpu-max-num-workgroups attribute I'm not sure what the interpretation of 0 is supposed to be; AMDGPUUsage doesn't say. --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 154 +++++++++++- ...ttr-amdgpu-max-num-workgroups-propagate.ll | 228 ++++++++++++++++++ 2 files changed, 380 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 2ae34636005ea..d7a7db5b556dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -179,6 +179,11 @@ class AMDGPUInformationCache : public InformationCache { return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()}; } + SmallVector getMaxNumWorkGroups(const Function &F) { + const GCNSubtarget &ST = TM.getSubtarget(F); + return ST.getMaxNumWorkGroups(F); + } + /// Get code object version. 
unsigned getCodeObjectVersion() const { return CodeObjectVersion; } @@ -821,6 +826,150 @@ AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP, "AAAMDFlatWorkGroupSize is only valid for function position"); } +struct TupleDecIntegerRangeState : public AbstractState { + DecIntegerState X, Y, Z; + + bool isValidState() const override { + return X.isValidState() && Y.isValidState() && Z.isValidState(); + } + + bool isAtFixpoint() const override { + return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint(); + } + + ChangeStatus indicateOptimisticFixpoint() override { + return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() | + Z.indicateOptimisticFixpoint(); + } + + ChangeStatus indicatePessimisticFixpoint() override { + return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() | + Z.indicatePessimisticFixpoint(); + } + + TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) { + X ^= Other.X; + Y ^= Other.Y; + Z ^= Other.Z; + return *this; + } + + bool operator==(const TupleDecIntegerRangeState &Other) const { + return X == Other.X && Y == Other.Y && Z == Other.Z; + } + + TupleDecIntegerRangeState &getAssumed() { return *this; } + const TupleDecIntegerRangeState &getAssumed() const { return *this; } +}; + +using AAAMDMaxNumWorkgroupsState = + StateWrapper; + +/// Propagate amdgpu-max-num-workgroups attribute. +struct AAAMDMaxNumWorkgroups + : public StateWrapper { + using Base = StateWrapper; + + AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {} + + void initialize(Attributor &A) override { + Function *F = getAssociatedFunction(); + auto &InfoCache = static_cast(A.getInfoCache()); + + SmallVector MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F); + + // FIXME: What is the interpretation of 0? 
+ for (unsigned &Entry : MaxNumWorkgroups) { + if (Entry == 0) + Entry = std::numeric_limits::max(); + } + + X.takeKnownMinimum(MaxNumWorkgroups[0]); + Y.takeKnownMinimum(MaxNumWorkgroups[1]); + Z.takeKnownMinimum(MaxNumWorkgroups[2]); + + if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) + indicatePessimisticFixpoint(); + } + + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Change = ChangeStatus::UNCHANGED; + + auto CheckCallSite = [&](AbstractCallSite CS) { + Function *Caller = CS.getInstruction()->getFunction(); + LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName() + << "->" << getAssociatedFunction()->getName() << '\n'); + + const auto *CallerInfo = A.getAAFor( + *this, IRPosition::function(*Caller), DepClassTy::REQUIRED); + if (!CallerInfo) + return false; + + Change |= + clampStateAndIndicateChange(this->getState(), CallerInfo->getState()); + return true; + }; + + bool AllCallSitesKnown = true; + if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown)) + return indicatePessimisticFixpoint(); + + return Change; + } + + /// Create an abstract attribute view for the position \p IRP. + static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP, + Attributor &A); + + ChangeStatus manifest(Attributor &A) override { + Function *F = getAssociatedFunction(); + // TODO: Skip adding if worst case? + LLVMContext &Ctx = F->getContext(); + SmallString<32> Buffer; + raw_svector_ostream OS(Buffer); + OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed(); + + // TODO: Should annotate loads of the group size for this to do anything + // useful. 
+ return A.manifestAttrs( + getIRPosition(), + {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())}, + /* ForceReplace= */ true); + } + + const std::string getName() const override { return "AAAMDMaxNumWorkgroups"; } + + const std::string getAsStr(Attributor *) const override { + std::string Buffer = "AAAMDMaxNumWorkgroupsState["; + raw_string_ostream OS(Buffer); + OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed() + << ']'; + return OS.str(); + } + + const char *getIdAddr() const override { return &ID; } + + /// This function should return true if the type of the \p AA is + /// AAAMDMaxNumWorkgroups + static bool classof(const AbstractAttribute *AA) { + return (AA->getIdAddr() == &ID); + } + + void trackStatistics() const override {} + + /// Unique ID (due to the unique address) + static const char ID; +}; + +const char AAAMDMaxNumWorkgroups::ID = 0; + +AAAMDMaxNumWorkgroups & +AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) { + if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION) + return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A); + llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position"); +} + /// Propagate amdgpu-waves-per-eu attribute. 
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) @@ -1046,8 +1195,8 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, DenseSet Allowed( {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, - &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, - &AAPointerInfo::ID, &AAPotentialConstantValues::ID, + &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, + &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID, &AAInstanceInfo::ID}); @@ -1071,6 +1220,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, for (auto *F : Functions) { A.getOrCreateAAFor(IRPosition::function(*F)); A.getOrCreateAAFor(IRPosition::function(*F)); + A.getOrCreateAAFor(IRPosition::function(*F)); A.getOrCreateAAFor(IRPosition::function(*F)); CallingConv::ID CC = F->getCallingConv(); if (!AMDGPU::isEntryFunctionCC(CC)) { diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll new file mode 100644 index 0000000000000..bd1aa6e6ed470 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll @@ -0,0 +1,228 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s + +; External call to avoid inferring argument attributes. 
This makes the +; final attribute groups easier to read +declare void @dummy() + +define void @extern_callee() { +; CHECK-LABEL: define void @extern_callee( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define internal void @callee_1_2_3() { +; CHECK-LABEL: define internal void @callee_1_2_3( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define amdgpu_kernel void @kernel_1_2_3() #0 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_1_2_3( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @callee_1_2_3() +; CHECK-NEXT: call void @extern_callee() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @callee_1_2_3() + call void @extern_callee() + call void @dummy() + ret void +} + +attributes #0 = {"amdgpu-max-num-workgroups"="1,2,3"} + +; -> 100,10,99 +define internal void @callee_merge_100_8_32__16_10_99() { +; CHECK-LABEL: define internal void @callee_merge_100_8_32__16_10_99( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define amdgpu_kernel void @kernel_100_8_32() #1 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_100_8_32( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99() +; CHECK-NEXT: ret void +; + call void @callee_merge_100_8_32__16_10_99() + ret void +} + +attributes #1 = {"amdgpu-max-num-workgroups"="100,8,32"} + +define amdgpu_kernel void @kernel_16_10_99() #2 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_16_10_99( +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @callee_merge_100_8_32__16_10_99() + call void @dummy() + ret void +} + +attributes #2 = {"amdgpu-max-num-workgroups"="16,10,99"} 
+ +define internal void @merge_to_worst_case() { +; CHECK-LABEL: define internal void @merge_to_worst_case( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define internal void @callee_x_worst_case() { +; CHECK-LABEL: define internal void @callee_x_worst_case( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define amdgpu_kernel void @kernel_x_maximum() #3 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_x_maximum( +; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { +; CHECK-NEXT: call void @merge_to_worst_case() +; CHECK-NEXT: call void @callee_x_worst_case() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @merge_to_worst_case() + call void @callee_x_worst_case() + call void @dummy() + ret void +} + +attributes #3 = {"amdgpu-max-num-workgroups"="4294967295,1,1"} + +define amdgpu_kernel void @kernel_y_maximum() #4 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_y_maximum( +; CHECK-SAME: ) #[[ATTR6:[0-9]+]] { +; CHECK-NEXT: call void @merge_to_worst_case() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @merge_to_worst_case() + call void @dummy() + ret void +} + +attributes #4 = {"amdgpu-max-num-workgroups"="1,4294967295,1"} + +define amdgpu_kernel void @kernel_z_maximum() #5 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_z_maximum( +; CHECK-SAME: ) #[[ATTR7:[0-9]+]] { +; CHECK-NEXT: call void @merge_to_worst_case() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @merge_to_worst_case() + call void @dummy() + ret void +} + +attributes #5 = {"amdgpu-max-num-workgroups"="1,1,4294967295"} + +; Make sure the attribute isn't lost from the callee. 
+define internal void @annotated_callee_from_unannotated_kernel() #6 { +; CHECK-LABEL: define internal void @annotated_callee_from_unannotated_kernel( +; CHECK-SAME: ) #[[ATTR8:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +attributes #6 = {"amdgpu-max-num-workgroups"="42,99,123"} + +define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee() { +; CHECK-LABEL: define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @annotated_callee_from_unannotated_kernel() +; CHECK-NEXT: ret void +; + call void @annotated_callee_from_unannotated_kernel() + ret void +} + + +define internal void @annotated_callee_merge_caller() #7 { +; CHECK-LABEL: define internal void @annotated_callee_merge_caller( +; CHECK-SAME: ) #[[ATTR9:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +attributes #7 = {"amdgpu-max-num-workgroups"="512,256,1024"} + +define amdgpu_kernel void @call_annotated_callee_merge_caller() #8 { +; CHECK-LABEL: define amdgpu_kernel void @call_annotated_callee_merge_caller( +; CHECK-SAME: ) #[[ATTR10:[0-9]+]] { +; CHECK-NEXT: call void @annotated_callee_merge_caller() +; CHECK-NEXT: ret void +; + call void @annotated_callee_merge_caller() + ret void +} + +attributes #8 = {"amdgpu-max-num-workgroups"="256,128,2048"} + +define internal void @called_by_explicit_worst_case() { +; CHECK-LABEL: define internal void @called_by_explicit_worst_case( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define amdgpu_kernel void @kernel_explicit_worst_case() #9 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_explicit_worst_case( +; CHECK-SAME: ) #[[ATTR11:[0-9]+]] { +; CHECK-NEXT: call void @called_by_explicit_worst_case() +; CHECK-NEXT: ret void +; + call void @called_by_explicit_worst_case() + ret void 
+} + +attributes #9 = {"amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295"} + +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="1,2,3" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="100,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="100,8,32" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-max-num-workgroups"="16,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-max-num-workgroups"="4294967295,1,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-max-num-workgroups"="1,4294967295,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-max-num-workgroups"="1,1,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-max-num-workgroups"="42,99,123" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-max-num-workgroups"="256,128,1024" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR10]] = { "amdgpu-max-num-workgroups"="256,128,2048" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR11]] = { "amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +;. 
From 41bb72b02a2983e54fb4f0fa6a6625ed65f27d7a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 21 Oct 2024 16:59:28 -0700 Subject: [PATCH 2/7] Comment parameter name --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index d7a7db5b556dd..8c762ff9d1ce0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -911,7 +911,9 @@ struct AAAMDMaxNumWorkgroups }; bool AllCallSitesKnown = true; - if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown)) + if (!A.checkForAllCallSites(CheckCallSite, *this, + /*RequireAllCallSites=*/true, + AllCallSitesKnown)) return indicatePessimisticFixpoint(); return Change; From dfe5c82636592a086e63afaed71afb801d42fec8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 21 Oct 2024 17:03:32 -0700 Subject: [PATCH 3/7] Add a shader entry test --- .../AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll index bd1aa6e6ed470..366432e0fc6cb 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll @@ -62,6 +62,16 @@ define amdgpu_kernel void @kernel_100_8_32() #1 { ret void } +define amdgpu_cs void @amdgpu_cs_100_8_32() #1 { +; CHECK-LABEL: define amdgpu_cs void @amdgpu_cs_100_8_32( +; CHECK-SAME: ) #[[ATTR3]] { +; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99() +; CHECK-NEXT: ret void +; + call void @callee_merge_100_8_32__16_10_99() + ret void +} + attributes #1 = {"amdgpu-max-num-workgroups"="100,8,32"} define amdgpu_kernel void @kernel_16_10_99() #2 { From 7994c830fc7077bb3f8f8033a8b98b131e5c86c7 Mon 
Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 21 Oct 2024 17:08:05 -0700 Subject: [PATCH 4/7] Check isValidState --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 8c762ff9d1ce0..78e1c8c5b82cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -902,7 +902,7 @@ struct AAAMDMaxNumWorkgroups const auto *CallerInfo = A.getAAFor( *this, IRPosition::function(*Caller), DepClassTy::REQUIRED); - if (!CallerInfo) + if (!CallerInfo || !CallerInfo->isValidState()) return false; Change |= From eaf4a265e1bc8d4a77fd47a1e9754013cd24df83 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 5 Nov 2024 13:09:45 -0800 Subject: [PATCH 5/7] Remove fixup of 0 --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 78e1c8c5b82cd..439ce53fd1fdf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -878,12 +878,6 @@ struct AAAMDMaxNumWorkgroups SmallVector MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F); - // FIXME: What is the interpretation of 0? 
- for (unsigned &Entry : MaxNumWorkgroups) { - if (Entry == 0) - Entry = std::numeric_limits::max(); - } - X.takeKnownMinimum(MaxNumWorkgroups[0]); Y.takeKnownMinimum(MaxNumWorkgroups[1]); Z.takeKnownMinimum(MaxNumWorkgroups[2]); From 2ad2f35ec4a3748d1567ebcd3050aa5657d61157 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 6 Nov 2024 13:57:58 -0800 Subject: [PATCH 6/7] Remove todo --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 439ce53fd1fdf..b6e968ef82c96 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -919,7 +919,6 @@ struct AAAMDMaxNumWorkgroups ChangeStatus manifest(Attributor &A) override { Function *F = getAssociatedFunction(); - // TODO: Skip adding if worst case? LLVMContext &Ctx = F->getContext(); SmallString<32> Buffer; raw_svector_ostream OS(Buffer); From cc4a77290bc498c22cf5b848c39e4effc8103ba5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 19 Oct 2024 02:18:45 +0400 Subject: [PATCH 7/7] AMDGPU: Mark grid size loads with range metadata Only handles the v5 case. 
--- .../AMDGPU/AMDGPULowerKernelAttributes.cpp | 33 ++++- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 1 + ...amdgpu-max-num-workgroups-load-annotate.ll | 124 ++++++++++++++++++ 3 files changed, 154 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index 1bb5e794da7dd..5fc0c36359b6f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" @@ -83,6 +84,20 @@ Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) { } // end anonymous namespace +static void annotateGridSizeLoadWithRangeMD(LoadInst *Load, + uint32_t MaxNumGroups) { + if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits::max()) + return; + + if (!Load->getType()->isIntegerTy(32)) + return; + + // TODO: If there is existing range metadata, preserve it if it is stricter. 
+ MDBuilder MDB(Load->getContext()); + MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1)); + Load->setMetadata(LLVMContext::MD_range, Range); +} + static bool processUse(CallInst *CI, bool IsV5OrAbove) { Function *F = CI->getParent()->getParent(); @@ -92,7 +107,11 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { const bool HasUniformWorkGroupSize = F->getFnAttribute("uniform-work-group-size").getValueAsBool(); - if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize) + SmallVector MaxNumWorkgroups = + AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", 3); + + if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize && + none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; })) return false; Value *BlockCounts[3] = {nullptr, nullptr, nullptr}; @@ -133,16 +152,22 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { if (IsV5OrAbove) { // Base is ImplicitArgPtr. switch (Offset) { case HIDDEN_BLOCK_COUNT_X: - if (LoadSize == 4) + if (LoadSize == 4) { BlockCounts[0] = Load; + annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]); + } break; case HIDDEN_BLOCK_COUNT_Y: - if (LoadSize == 4) + if (LoadSize == 4) { BlockCounts[1] = Load; + annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]); + } break; case HIDDEN_BLOCK_COUNT_Z: - if (LoadSize == 4) + if (LoadSize == 4) { BlockCounts[2] = Load; + annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]); + } break; case HIDDEN_GROUP_SIZE_X: if (LoadSize == 2) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 54b17ca2cffb1..b18ce90cf45db 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -369,6 +369,7 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct TM.getSubtarget(F)); } +// FIXME: This has no reason to be in subtarget SmallVector AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const { return 
AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3, diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll new file mode 100644 index 0000000000000..9064292129928 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s + +define i32 @use_grid_size_x_max_num_workgroups() #0 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0:![0-9]+]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4 + ret i32 %grid.size.x +} + +define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range() #0 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4, !range !0 + ret i32 %grid.size.x +} + +define i32 @use_grid_size_y_max_num_workgroups() #0 { +; CHECK-LABEL: define i32 @use_grid_size_y_max_num_workgroups( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call 
ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GEP_GRID_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 4 +; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG1:![0-9]+]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_Y]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.grid.size.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 4 + %grid.size.y = load i32, ptr addrspace(4) %gep.grid.size.y, align 4 + ret i32 %grid.size.y +} + +define i32 @use_grid_size_z_max_num_workgroups() #0 { +; CHECK-LABEL: define i32 @use_grid_size_z_max_num_workgroups( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GEP_GRID_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 8 +; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG2:![0-9]+]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_Z]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.grid.size.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 8 + %grid.size.z = load i32, ptr addrspace(4) %gep.grid.size.z, align 4 + ret i32 %grid.size.z +} + +define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type() #0 { +; CHECK-LABEL: define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load <2 x i16>, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4 +; CHECK-NEXT: ret <2 x i16> [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load <2 x i16>, ptr addrspace(4) %implicitarg.ptr, align 4 + ret <2 x i16> %grid.size.x +} + +define i32 
@use_grid_size_x_max_num_workgroups_max_minus_1() #1 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max_minus_1( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG3:![0-9]+]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4 + ret i32 %grid.size.x +} + +define i32 @use_grid_size_x_max_num_workgroups_max() #2 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4 +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4 + ret i32 %grid.size.x +} + +define i32 @use_grid_size_x_max_num_workgroups_zero() #3 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_zero( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4 +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4 + ret i32 %grid.size.x +} + +declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3 + +attributes #0 = { "amdgpu-max-num-workgroups"="36,42,89" } +attributes #1 = { "amdgpu-max-num-workgroups"="4294967294,42,89" } +attributes #2 = { "amdgpu-max-num-workgroups"="4294967295,42,89" } 
+attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" } + +!0 = !{i32 0, i32 -1} + +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. +; CHECK: [[RNG0]] = !{i32 1, i32 37} +; CHECK: [[RNG1]] = !{i32 1, i32 43} +; CHECK: [[RNG2]] = !{i32 1, i32 90} +; CHECK: [[RNG3]] = !{i32 1, i32 -1} +;.