diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index a7ebbf3bb4dea..5b83ea428c0bf 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1645,9 +1645,9 @@ The AMDGPU backend supports the following LLVM IR attributes. reduced by heuristics. "amdgpu-max-num-workgroups"="x,y,z" Specify the maximum number of work groups for the kernel dispatch in the - X, Y, and Z dimensions. Generated by the ``amdgpu_max_num_work_groups`` - CLANG attribute [CLANG-ATTR]_. Clang only emits this attribute when all - the three numbers are >= 1. + X, Y, and Z dimensions. Each number must be >= 1. Generated by the + ``amdgpu_max_num_work_groups`` CLANG attribute [CLANG-ATTR]_. Clang only + emits this attribute when all the three numbers are >= 1. "amdgpu-no-agpr" Indicates the function will not require allocating AGPRs. This is only relevant on subtargets with AGPRs. The behavior is undefined if a diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index bd418efcb83cb..ee8a700f988dc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -504,14 +504,21 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, Kern[".max_flat_workgroup_size"] = Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize()); - unsigned NumWGX = MFI.getMaxNumWorkGroupsX(); - unsigned NumWGY = MFI.getMaxNumWorkGroupsY(); - unsigned NumWGZ = MFI.getMaxNumWorkGroupsZ(); - if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) { + + uint32_t NumWGY = MFI.getMaxNumWorkGroupsY(); + uint32_t NumWGZ = MFI.getMaxNumWorkGroupsZ(); + uint32_t NumWGX = MFI.getMaxNumWorkGroupsX(); + + // TODO: Should consider 0 invalid and reject in IR verifier. + if (NumWGX != std::numeric_limits::max() && NumWGX != 0) Kern[".max_num_workgroups_x"] = Kern.getDocument()->getNode(NumWGX); + + if (NumWGY != std::numeric_limits::max() && NumWGY != 0) Kern[".max_num_workgroups_y"] = Kern.getDocument()->getNode(NumWGY); + + if (NumWGZ != std::numeric_limits::max() && NumWGZ != 0) Kern[".max_num_workgroups_z"] = Kern.getDocument()->getNode(NumWGZ); - } + Kern[".sgpr_spill_count"] = Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs()); Kern[".vgpr_spill_count"] = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 961a9220b48d6..54b17ca2cffb1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -371,5 +371,6 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct SmallVector AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const { - return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3); + return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3, + std::numeric_limits::max()); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 20a81a3135f0b..c167e27ab07a5 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1307,15 +1307,16 @@ getIntegerPairAttribute(const Function &F, StringRef Name, } SmallVector getIntegerVecAttribute(const Function &F, StringRef Name, - unsigned Size) { + unsigned Size, + unsigned DefaultVal) { assert(Size > 2); - SmallVector Default(Size, 0); + SmallVector Default(Size, DefaultVal); Attribute A = F.getFnAttribute(Name); if (!A.isStringAttribute()) return Default; - SmallVector Vals(Size, 0); + SmallVector Vals(Size, DefaultVal); LLVMContext &Ctx = F.getContext(); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index d1d84394cc070..beebe320b2cf3 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -919,7 +919,8 @@ getIntegerPairAttribute(const Function &F, StringRef Name, /// /// \returns false if any error occurs. SmallVector getIntegerVecAttribute(const Function &F, StringRef Name, - unsigned Size); + unsigned Size, + unsigned DefaultVal = 0); /// Represents the counter values to wait for in an s_waitcnt instruction. /// diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups.ll similarity index 59% rename from llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups.ll rename to llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups.ll index bc58222076ac0..ffbe2ec5f173e 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups.ll @@ -46,6 +46,32 @@ entry: attributes #4 = {"amdgpu-max-num-workgroups"="1024,1024,1024"} + +; Ignore if number of work groups for x dimension is 0. +; CHECK-LABEL: {{^}}empty_max_num_workgroups_x_max: +define amdgpu_kernel void @empty_max_num_workgroups_x_max() #5 { +entry: + ret void +} +attributes #5 = {"amdgpu-max-num-workgroups"="4294967295,2,3"} + +; Ignore if number of work groups for y dimension is 0. +; CHECK-LABEL: {{^}}empty_max_num_workgroups_y_max: +define amdgpu_kernel void @empty_max_num_workgroups_y_max() #6 { +entry: + ret void +} +attributes #6 = {"amdgpu-max-num-workgroups"="1,4294967295,3"} + +; Ignore if number of work groups for z dimension is 0. +; CHECK-LABEL: {{^}}empty_max_num_workgroups_z_max: +define amdgpu_kernel void @empty_max_num_workgroups_z_max() #7 { +entry: + ret void +} +attributes #7 = {"amdgpu-max-num-workgroups"="1,2,4294967295"} + + ; CHECK: .amdgpu_metadata ; CHECK: - .args: ; CHECK: .max_flat_workgroup_size: 1024 @@ -54,16 +80,22 @@ attributes #4 = {"amdgpu-max-num-workgroups"="1024,1024,1024"} ; CHECK: - .args: ; CHECK: .max_flat_workgroup_size: 1024 +; CHECK-NEXT: .max_num_workgroups_y: 2 +; CHECK-NEXT: .max_num_workgroups_z: 3 ; CHECK-NEXT: .name: empty_max_num_workgroups_x0 ; CHECK-NEXT: .private_segment_fixed_size: 0 ; CHECK: - .args: ; CHECK: .max_flat_workgroup_size: 1024 +; CHECK-NEXT: .max_num_workgroups_x: 1 +; CHECK-NEXT: .max_num_workgroups_z: 3 ; CHECK-NEXT: .name: empty_max_num_workgroups_y0 ; CHECK-NEXT: .private_segment_fixed_size: 0 ; CHECK: - .args: ; CHECK: .max_flat_workgroup_size: 1024 +; CHECK-NEXT: .max_num_workgroups_x: 1 +; CHECK-NEXT: .max_num_workgroups_y: 2 ; CHECK-NEXT: .name: empty_max_num_workgroups_z0 ; CHECK-NEXT: .private_segment_fixed_size: 0 @@ -82,3 +114,25 @@ attributes #4 = {"amdgpu-max-num-workgroups"="1024,1024,1024"} ; CHECK-NEXT: .max_num_workgroups_z: 1024 ; CHECK-NEXT: .name: empty_max_num_workgroups_1024_1024_1024 ; CHECK-NEXT: .private_segment_fixed_size: 0 + + +; CHECK: - .args: +; CHECK: .max_flat_workgroup_size: 1024 +; CHECK-NEXT: .max_num_workgroups_y: 2 +; CHECK-NEXT: .max_num_workgroups_z: 3 +; CHECK-NEXT: .name: empty_max_num_workgroups_x_max +; CHECK-NEXT: .private_segment_fixed_size: 0 + +; CHECK: - .args: +; CHECK: .max_flat_workgroup_size: 1024 +; CHECK-NEXT: .max_num_workgroups_x: 1 +; CHECK-NEXT: .max_num_workgroups_z: 3 +; CHECK-NEXT: .name: empty_max_num_workgroups_y_max +; CHECK-NEXT: .private_segment_fixed_size: 0 + +; CHECK: - .args: +; CHECK: .max_flat_workgroup_size: 1024 +; CHECK-NEXT: .max_num_workgroups_x: 1 +; CHECK-NEXT: .max_num_workgroups_y: 2 +; CHECK-NEXT: .name: empty_max_num_workgroups_z_max +; CHECK-NEXT: .private_segment_fixed_size: 0