-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[clang][CodeGen][SPIRV] Translate amdgpu_flat_work_group_size into max_work_group_size.
#116820
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[clang][CodeGen][SPIRV] Translate amdgpu_flat_work_group_size into max_work_group_size.
#116820
Conversation
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-clang Author: Alex Voicu (AlexVlx) ChangesHIPAMD relies on the This patch addresses that limitation by converting the unsupported attribute into the Full diff: https://github.com/llvm/llvm-project/pull/116820.diff 2 Files Affected:
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index a48fe9d5f1ee9c..c35d91b1f49af2 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -64,6 +64,8 @@ class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo {
void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
const VarDecl *D) const override;
+ void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
+ CodeGen::CodeGenModule &M) const override;
llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
SyncScope Scope,
llvm::AtomicOrdering Ordering,
@@ -245,6 +247,38 @@ SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
return DefaultGlobalAS;
}
+void SPIRVTargetCodeGenInfo::setTargetAttributes(
+ const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
+ if (!M.getLangOpts().HIP ||
+ M.getTarget().getTriple().getVendor() != llvm::Triple::AMD)
+ return;
+ if (GV->isDeclaration())
+ return;
+
+ auto F = dyn_cast<llvm::Function>(GV);
+ if (!F)
+ return;
+
+ auto FD = dyn_cast_or_null<FunctionDecl>(D);
+ if (!FD)
+ return;
+ if (!FD->hasAttr<CUDAGlobalAttr>())
+ return;
+
+ unsigned N = M.getLangOpts().GPUMaxThreadsPerBlock;
+ if (auto FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>())
+ N = FlatWGS->getMax()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+
+ auto Int32Ty = llvm::IntegerType::getInt32Ty(M.getLLVMContext());
+ llvm::Metadata *AttrMDArgs[] = {
+ llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, N)),
+ llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1)),
+ llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1))};
+
+ F->setMetadata("reqd_work_group_size",
+ llvm::MDNode::get(M.getLLVMContext(), AttrMDArgs));
+}
+
llvm::SyncScope::ID
SPIRVTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &, SyncScope Scope,
llvm::AtomicOrdering,
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
index 11a133fd1351d2..3d01ac40259254 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -4,6 +4,9 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \
// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=CHECK,MAX1024 %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa --gpu-max-threads-per-block=1024 \
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefixes=CHECK-SPIRV,MAX1024-SPIRV %s
// RUN: %clang_cc1 -triple nvptx \
// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \
// RUN: -check-prefix=NAMD
@@ -21,12 +24,14 @@
__global__ void flat_work_group_size_default() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
+// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z28flat_work_group_size_defaultv(){{.*}} !reqd_work_group_size [[REQD_WORK_GROUP_SIZE_DEFAULT:![0-9]+]]
// NOUB: define{{.*}} void @_Z28flat_work_group_size_defaultv() [[NOUB:#[0-9]+]]
}
__attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
__global__ void flat_work_group_size_32_64() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]
+// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z26flat_work_group_size_32_64v(){{.*}} !reqd_work_group_size [[REQD_WORK_GROUP_SIZE_64:![0-9]+]]
}
__attribute__((amdgpu_waves_per_eu(2))) // expected-no-diagnostics
__global__ void waves_per_eu_2() {
@@ -82,7 +87,9 @@ template __global__ void template_32_4_a_max_num_work_groups<2>();
// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
+// MAX1024-SPIRV-DAG: [[REQD_WORK_GROUP_SIZE_DEFAULT]] = !{i32 1024, i32 1, i32 1}
// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
+// CHECK-SPIRV-DAG: [[REQD_WORK_GROUP_SIZE_64]] = !{i32 64, i32 1, i32 1}
// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
|
|
@llvm/pr-subscribers-backend-amdgpu Author: Alex Voicu (AlexVlx) ChangesHIPAMD relies on the This patch addresses that limitation by converting the unsupported attribute into the Full diff: https://github.com/llvm/llvm-project/pull/116820.diff 2 Files Affected:
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index a48fe9d5f1ee9c..c35d91b1f49af2 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -64,6 +64,8 @@ class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo {
void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
const VarDecl *D) const override;
+ void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
+ CodeGen::CodeGenModule &M) const override;
llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
SyncScope Scope,
llvm::AtomicOrdering Ordering,
@@ -245,6 +247,38 @@ SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
return DefaultGlobalAS;
}
+void SPIRVTargetCodeGenInfo::setTargetAttributes(
+ const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
+ if (!M.getLangOpts().HIP ||
+ M.getTarget().getTriple().getVendor() != llvm::Triple::AMD)
+ return;
+ if (GV->isDeclaration())
+ return;
+
+ auto F = dyn_cast<llvm::Function>(GV);
+ if (!F)
+ return;
+
+ auto FD = dyn_cast_or_null<FunctionDecl>(D);
+ if (!FD)
+ return;
+ if (!FD->hasAttr<CUDAGlobalAttr>())
+ return;
+
+ unsigned N = M.getLangOpts().GPUMaxThreadsPerBlock;
+ if (auto FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>())
+ N = FlatWGS->getMax()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+
+ auto Int32Ty = llvm::IntegerType::getInt32Ty(M.getLLVMContext());
+ llvm::Metadata *AttrMDArgs[] = {
+ llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, N)),
+ llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1)),
+ llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1))};
+
+ F->setMetadata("reqd_work_group_size",
+ llvm::MDNode::get(M.getLLVMContext(), AttrMDArgs));
+}
+
llvm::SyncScope::ID
SPIRVTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &, SyncScope Scope,
llvm::AtomicOrdering,
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
index 11a133fd1351d2..3d01ac40259254 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -4,6 +4,9 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \
// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=CHECK,MAX1024 %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa --gpu-max-threads-per-block=1024 \
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefixes=CHECK-SPIRV,MAX1024-SPIRV %s
// RUN: %clang_cc1 -triple nvptx \
// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \
// RUN: -check-prefix=NAMD
@@ -21,12 +24,14 @@
__global__ void flat_work_group_size_default() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
+// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z28flat_work_group_size_defaultv(){{.*}} !reqd_work_group_size [[REQD_WORK_GROUP_SIZE_DEFAULT:![0-9]+]]
// NOUB: define{{.*}} void @_Z28flat_work_group_size_defaultv() [[NOUB:#[0-9]+]]
}
__attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
__global__ void flat_work_group_size_32_64() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]
+// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z26flat_work_group_size_32_64v(){{.*}} !reqd_work_group_size [[REQD_WORK_GROUP_SIZE_64:![0-9]+]]
}
__attribute__((amdgpu_waves_per_eu(2))) // expected-no-diagnostics
__global__ void waves_per_eu_2() {
@@ -82,7 +87,9 @@ template __global__ void template_32_4_a_max_num_work_groups<2>();
// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
+// MAX1024-SPIRV-DAG: [[REQD_WORK_GROUP_SIZE_DEFAULT]] = !{i32 1024, i32 1, i32 1}
// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
+// CHECK-SPIRV-DAG: [[REQD_WORK_GROUP_SIZE_64]] = !{i32 64, i32 1, i32 1}
// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
|
|
reqd_work_group_size is for OpenCL reqd_work_group_size attribute and it sets exact block size. amdgpu-flat-work-group-size sets a (min, max) range for block size. HIP launch bounds sets a block size range (1, bound). It cannot be represented by reqd_work_group_size. |
This is not quite correct. CUDA defines |
For example, if you use reqd_work_group_size to represent launch_bounds(1024), then launch the kernel with block size 256, it will fail since reqd_work_group_size means the kernel can only be launched with block size 1024. I don't think that matches what launch_bounds(1024) intends to be. It intends to allow the kernel to be launched with block size between 1 and 1024. |
Oh, apologies, I probably should have clarified that we're only going to see this in SPIR-V, as part of run-time finalisation/JIT it gets translated back into the original amdgpu attribute; it's mostly for the convenience of carrying the maximum / composing with existing tools that an existing attribute is chosen, otherwise I'd have had to side-channel it. I agree that it is not a direct match, but sadly there is no direct match, as |
arsenm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is just wrong. These are not equivalent or translatable between each other.
The flat work group size is 1-dimensional only, and a range of permissible values. reqd_work_group_size is an exact match for all 3 dimensions. The backend already can directly consume reqd_work_group_size
I see. Basically we redefined the semantic of reqd_work_group_size for HIP-generated SPIRV. Do we have a way to differentiate OpenCL-generated and HIP-generated SPIRV? They will be translated differently about reqd_work_group_size |
There shouldn't be one. Representation needs to be consistent and language / producer / consumer independent |
…le_flat_work_group_size_amdgcnspirv
…better and doesn't clash with OCL uses.
| llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1)), | ||
| llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1))}; | ||
|
|
||
| F->setMetadata("max_work_group_size", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why metadata? I know the OpenCL stuff uses metadata, but I think that's because it predates arbitrary string attributes. This is also "setTargetAttributes".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because the Translator and other tools interact/expect it as metadata.
| if (!M.getLangOpts().HIP || | ||
| M.getTarget().getTriple().getVendor() != llvm::Triple::AMD) | ||
| return; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove the vendor check. The language check is also suspect, this is interpretation of a target attribute
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is not super suspect in context as at the moment this is only for AMDGCN flavoured SPIR-V, which we only support in HIP, and the mapping from flat to dim X only makes sense there.
| auto F = dyn_cast<llvm::Function>(GV); | ||
| if (!F) | ||
| return; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this fail if the FunctionDecl below fails?
| // We encode the maximum flat WG size in the first component of the 3D | ||
| // max_work_group_size attribute, which will get reverse translated into the | ||
| // original AMDGPU attribute when targeting AMDGPU. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm still confused about why this is supposed to be "OK"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is OK in HIP because the language (only) defines __launch_bounds__, which is 1D, and we implement with the AMDGPU attribute. At the same time, the SPIR-V attribute cannot be produced via other defined means in HIP (there's no Clang __attribute__ for it, for example, so the user couldn't have written some N-dimensional max_work_group_size themselves), so its presence in AMDGCN flavoured SPIR-V is fairly unambiguously originating from here.
In general we will eventually replace this with processing for all AMDGPU attributes, but that has some challenges in that it'd be more infectious in the translator (or the BE and any eventual SPIR-V consumer, if they were to manifest). Conversely we cannot just drop the original attribute on the floor as correctness depends on it. Hence the PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That reminds me, the way launch_bounds is implemented is also problematic: #91468
Launch bounds should not be implemented in terms of the flat workgroup size attribute in a header. clang should be directly interpreting it. So yes, there is a clang attribute for it. It's just HIP has a hackier implementation of it than the CUDA attribute which we just ignore
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are talking across eachother. I am saying that the SPIR-V attribute cannot be generated via Clang, i.e. that you cannot write __attribute__((foo)) in your source and obtain max_work_group_size metadata, at the moment. Furthermore, from the implementation of Clang's __launch_bounds__:
// An AST node is created for this attribute, but is not used by other parts
// of the compiler. However, this node needs to exist in the AST because
// non-LLVM backends may be relying on the attribute's presence.So this is a glorified annotation / we'd still have to decide on how to lower it into IR, which would likely end up atop flat workgroup size, unless we choose to spam yet another attribute. We also use flat workgroup size implicitly to control / implement --gpu-max-threads-per-block, which is important for correctness, and is in a fairly similar place with __launch_bounds__ (it's always 1D, doesn't have a minimum etc.). It's also not handled by this patch, so I'll have to add it:)
That being said, the idea in #91468 is sound, but it will require a bit of work to get done; I think we'd still have to choose a way to pass the info through SPIR-V (what this PR tries to do).
yxsamliu
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
…le_flat_work_group_size_amdgcnspirv
…le_flat_work_group_size_amdgcnspirv
amdgpu_flat_work_group_size into reqd_work_group_size.amdgpu_flat_work_group_size into max_work_group_size.
…`max_work_group_size`. (llvm#116820) HIPAMD relies on the `amdgpu_flat_work_group_size` attribute to implement key functionality such as the `__launch_bounds__` `__global__` function annotation. This attribute is not available / directly translatable to SPIR-V, hence as it is AMDGCN flavoured SPIR-V suffers from information loss. This patch addresses that limitation by converting the unsupported attribute into the `max_work_group_size` attribute which maps to [`MaxWorkgroupSizeINTEL`](https://github.com/KhronosGroup/SPIRV-Registry/blob/main/extensions/INTEL/SPV_INTEL_kernel_attributes.asciidoc), which is available in / handled by SPIR-V. When reverse translating from SPIR-V to AMDGCN LLVMIR we invert the map and add the original AMDGPU attribute. Change-Id: I5d95cd17d7169a61dc26fb410a838263e4497374
HIPAMD relies on the
amdgpu_flat_work_group_sizeattribute to implement key functionality such as the__launch_bounds____global__function annotation. This attribute is not available / directly translatable to SPIR-V, hence as it is AMDGCN flavoured SPIR-V suffers from information loss.This patch addresses that limitation by converting the unsupported attribute into the
max_work_group_sizeattribute which maps toMaxWorkgroupSizeINTEL, which is available in / handled by SPIR-V. When reverse translating from SPIR-V to AMDGCN LLVMIR we invert the map and add the original AMDGPU attribute.