Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 12 additions & 16 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -576,8 +576,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Get the HSA kernel object representing the kernel function.
uint64_t getKernelObject() const { return KernelObject; }

/// Get the size of implicitargs based on the code object version
/// @return 56 for cov4 and 256 for cov5
/// Get the size of implicitargs based on the code object version.
uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }

/// Indicates whether or not we need to set up our own private segment size.
Expand Down Expand Up @@ -3386,20 +3385,17 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
return Err;

// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
if (ImplArgs &&
getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) {
ImplArgs->BlockCountX = NumBlocks[0];
ImplArgs->BlockCountY = NumBlocks[1];
ImplArgs->BlockCountZ = NumBlocks[2];
ImplArgs->GroupSizeX = NumThreads[0];
ImplArgs->GroupSizeY = NumThreads[1];
ImplArgs->GroupSizeZ = NumThreads[2];
ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
? 3
: 1 + (NumBlocks[1] * NumThreads[1] != 1);
ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
}
// Set the COV5+ implicit arguments to the appropriate values.
ImplArgs->BlockCountX = NumBlocks[0];
ImplArgs->BlockCountY = NumBlocks[1];
ImplArgs->BlockCountZ = NumBlocks[2];
ImplArgs->GroupSizeX = NumThreads[0];
ImplArgs->GroupSizeY = NumThreads[1];
ImplArgs->GroupSizeZ = NumThreads[2];
ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
? 3
: 1 + (NumBlocks[1] * NumThreads[1] != 1);
ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;

// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
Expand Down
9 changes: 1 addition & 8 deletions offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,10 @@ struct AMDGPUImplicitArgsTy {
uint8_t Unused2[132]; // 132 byte offset.
};

// Dummy struct for COV4 implicitargs.
struct AMDGPUImplicitArgsTyCOV4 {
uint8_t Unused[56];
};

/// Returns the size in bytes of the implicit arguments of AMDGPU kernels.
/// `Version` is the ELF ABI version, e.g. COV5.
inline uint32_t getImplicitArgsSize(uint16_t Version) {
return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5
? sizeof(AMDGPUImplicitArgsTyCOV4)
: sizeof(AMDGPUImplicitArgsTy);
return sizeof(AMDGPUImplicitArgsTy);
}

/// Reads the AMDGPU specific metadata from the ELF file and propagates the
Expand Down
5 changes: 2 additions & 3 deletions offload/plugins-nextgen/common/src/Utils/ELF.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,9 @@ checkMachineImpl(const object::ELFObjectFile<ELFT> &ELFObj, uint16_t EMachine) {
if (Header.e_machine == EM_AMDGPU) {
if (Header.e_ident[EI_OSABI] != ELFOSABI_AMDGPU_HSA)
return createError("Invalid AMD OS/ABI, must be AMDGPU_HSA");
if (Header.e_ident[EI_ABIVERSION] != ELFABIVERSION_AMDGPU_HSA_V4 &&
Header.e_ident[EI_ABIVERSION] != ELFABIVERSION_AMDGPU_HSA_V5 &&
if (Header.e_ident[EI_ABIVERSION] != ELFABIVERSION_AMDGPU_HSA_V5 &&
Header.e_ident[EI_ABIVERSION] != ELFABIVERSION_AMDGPU_HSA_V6)
return createError("Invalid AMD ABI version, must be version 4 or above");
return createError("Invalid AMD ABI version, must be version 5 or above");
if ((Header.e_flags & EF_AMDGPU_MACH) < EF_AMDGPU_MACH_AMDGCN_GFX700 ||
(Header.e_flags & EF_AMDGPU_MACH) >
EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC)
Expand Down