diff --git a/llvm/include/llvm/Support/AMDGPUMetadata.h b/llvm/include/llvm/Support/AMDGPUMetadata.h index 990c825ae6875..2ce8e95836550 100644 --- a/llvm/include/llvm/Support/AMDGPUMetadata.h +++ b/llvm/include/llvm/Support/AMDGPUMetadata.h @@ -48,7 +48,7 @@ constexpr uint32_t VersionMinorV5 = 2; /// HSA metadata major version for code object V6. constexpr uint32_t VersionMajorV6 = 1; /// HSA metadata minor version for code object V6. -constexpr uint32_t VersionMinorV6 = 2; +constexpr uint32_t VersionMinorV6 = 3; /// Old HSA metadata beginning assembler directive for V2. This is only used for /// diagnostics now. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index d158f0f58d711..6bab8e6da8545 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -16,12 +16,15 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +using namespace llvm::KernArgPreload; #define DEBUG_TYPE "amdgpu-argument-reg-usage-info" INITIALIZE_PASS(AMDGPUArgumentUsageInfo, DEBUG_TYPE, "Argument Register Usage Information Storage", false, true) +constexpr HiddenArgInfo HiddenArgUtils::HiddenArgs[END_HIDDEN_ARGS]; + void ArgDescriptor::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const { if (!isSet()) { @@ -176,6 +179,31 @@ AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { return AI; } +SmallVector +AMDGPUFunctionArgInfo::getPreloadDescriptorsForArgIdx(unsigned ArgIdx) const { + SmallVector Results; + for (unsigned PartIdx = 0; PartIdx < PreloadKernArgs.size(); ++PartIdx) { + const auto &Desc = PreloadKernArgs[PartIdx]; + if (Desc.OrigArgIdx == ArgIdx) + Results.push_back(&Desc); + } + + return Results; +} + +const KernArgPreloadDescriptor * +AMDGPUFunctionArgInfo::getHiddenArgPreloadDescriptor(HiddenArg HA) const { + assert(HA < END_HIDDEN_ARGS); + + auto HiddenArgIt = PreloadHiddenArgsIndexMap.find(HA); + if (HiddenArgIt == PreloadHiddenArgsIndexMap.end()) + return nullptr; + + const KernArgPreloadDescriptor &Desc = PreloadKernArgs[HiddenArgIt->second]; + assert(Desc.IsValid && "Hidden argument preload descriptor not valid."); + return &Desc; +} + const AMDGPUFunctionArgInfo & AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const { auto I = ArgInfoMap.find(&F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index e07d47381ecca..f672c6edc9739 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -11,7 +11,11 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/IndexedMap.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Register.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Type.h" #include "llvm/Pass.h" namespace llvm { @@ -95,11 +99,81 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { return OS; } -struct KernArgPreloadDescriptor : public ArgDescriptor { - KernArgPreloadDescriptor() {} - SmallVector Regs; +namespace KernArgPreload { + +enum HiddenArg : unsigned { + HIDDEN_BLOCK_COUNT_X = 0, + HIDDEN_BLOCK_COUNT_Y = 1, + HIDDEN_BLOCK_COUNT_Z = 2, + HIDDEN_GROUP_SIZE_X = 3, + HIDDEN_GROUP_SIZE_Y = 4, + HIDDEN_GROUP_SIZE_Z = 5, + HIDDEN_REMAINDER_X = 6, + HIDDEN_REMAINDER_Y = 7, + HIDDEN_REMAINDER_Z = 8, + END_HIDDEN_ARGS = HIDDEN_REMAINDER_Z + 1 }; +// Stores information about a specific hidden argument. +struct HiddenArgInfo { + // Offset in bytes from the location in the kernearg segment pointed to by + // the implicitarg pointer. + uint8_t Offset = 0; + // The size of the hidden argument in bytes. + uint8_t Size = 0; + // The name of the hidden argument in the kernel signature. + const char *Name = nullptr; +}; + +struct HiddenArgUtils { + static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = { + {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"}, + {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"}, + {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"}, + {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"}, + {22, 2, "_hidden_remainder_z"}}; + + static HiddenArg getHiddenArgFromOffset(unsigned Offset) { + for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I) { + if (HiddenArgs[I].Offset == Offset) + return static_cast(I); + } + + return END_HIDDEN_ARGS; + } + + static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) { + if (HA < END_HIDDEN_ARGS) + return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8); + + llvm_unreachable("unexpected hidden argument"); + } + + static const char *getHiddenArgName(HiddenArg HA) { + if (HA < END_HIDDEN_ARGS) + return HiddenArgs[HA].Name; + + llvm_unreachable("unexpected hidden argument"); + } +}; + +struct KernArgPreloadDescriptor { + // Id of the original argument in the IR kernel function argument list. + unsigned OrigArgIdx = 0; + + // If this IR argument was split into multiple parts, this is the index of the + // part in the original argument. + unsigned PartIdx = 0; + + // The registers that the argument is preloaded into. The argument may be + // split across multiple registers. + SmallVector Regs; + + bool IsValid = false; +}; + +} // namespace KernArgPreload + struct AMDGPUFunctionArgInfo { // clang-format off enum PreloadedValue { @@ -160,8 +234,17 @@ struct AMDGPUFunctionArgInfo { ArgDescriptor WorkItemIDY; ArgDescriptor WorkItemIDZ; + struct PreloadArgIndexFunctor { + using argument_type = unsigned; + unsigned operator()(unsigned Idx) const { return Idx; } + }; + // Map the index of preloaded kernel arguments to its descriptor. - SmallDenseMap PreloadKernArgs{}; + IndexedMap + PreloadKernArgs; + + // Map hidden argument to the index of it's descriptor. + SmallDenseMap PreloadHiddenArgsIndexMap{}; // The first user SGPR allocated for kernarg preloading. Register FirstKernArgPreloadReg; @@ -169,6 +252,16 @@ struct AMDGPUFunctionArgInfo { getPreloadedValue(PreloadedValue Value) const; static AMDGPUFunctionArgInfo fixedABILayout(); + + // Returns preload argument descriptors for an IR argument index. Isel may + // split IR arguments into multiple parts, the return vector holds all parts + // associated with an IR argument in the kernel signature. + SmallVector + getPreloadDescriptorsForArgIdx(unsigned ArgIdx) const; + + // Returns the hidden arguments `KernArgPreloadDescriptor` if it is preloaded. + const KernArgPreload::KernArgPreloadDescriptor * + getHiddenArgPreloadDescriptor(KernArgPreload::HiddenArg HA) const; }; class AMDGPUArgumentUsageInfo : public ImmutablePass { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 19b8757e6ad6e..97c15f145dec8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -15,6 +15,7 @@ #include "AMDGPUHSAMetadataStreamer.h" #include "AMDGPU.h" #include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "SIMachineFunctionInfo.h" #include "SIProgramInfo.h" @@ -290,7 +291,7 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF, if (Arg.hasAttribute("amdgpu-hidden-argument")) continue; - emitKernelArg(Arg, Offset, Args); + emitKernelArg(Arg, Offset, Args, MF); } emitHiddenKernelArgs(MF, Offset, Args); @@ -300,7 +301,14 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF, void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg, unsigned &Offset, - msgpack::ArrayDocNode Args) { + msgpack::ArrayDocNode Args, + const MachineFunction &MF) { + emitKernelArgCommon(Arg, Offset, Args, MF); +} + +void MetadataStreamerMsgPackV4::emitKernelArgCommon( + const Argument &Arg, unsigned &Offset, msgpack::ArrayDocNode Args, + const MachineFunction &MF, StringRef PreloadRegisters) { const auto *Func = Arg.getParent(); auto ArgNo = Arg.getArgNo(); const MDNode *Node; @@ -357,17 +365,18 @@ void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg, Align ArgAlign; std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL); - emitKernelArg(DL, ArgTy, ArgAlign, - getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args, - PointeeAlign, Name, TypeName, BaseTypeName, ActAccQual, - AccQual, TypeQual); + emitKernelArgImpl(DL, ArgTy, ArgAlign, + getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args, + PreloadRegisters, PointeeAlign, Name, TypeName, + BaseTypeName, ActAccQual, AccQual, TypeQual); } -void MetadataStreamerMsgPackV4::emitKernelArg( +void MetadataStreamerMsgPackV4::emitKernelArgImpl( const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind, - unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign, - StringRef Name, StringRef TypeName, StringRef BaseTypeName, - StringRef ActAccQual, StringRef AccQual, StringRef TypeQual) { + unsigned &Offset, msgpack::ArrayDocNode Args, StringRef PreloadRegisters, + MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName, + StringRef BaseTypeName, StringRef ActAccQual, StringRef AccQual, + StringRef TypeQual) { auto Arg = Args.getDocument()->getMapNode(); if (!Name.empty()) @@ -409,6 +418,11 @@ void MetadataStreamerMsgPackV4::emitKernelArg( Arg[".is_pipe"] = Arg.getDocument()->getNode(true); } + if (!PreloadRegisters.empty()) { + Arg[".preload_registers"] = + Arg.getDocument()->getNode(PreloadRegisters, /*Copy=*/true); + } + Args.push_back(Arg); } @@ -428,14 +442,14 @@ void MetadataStreamerMsgPackV4::emitHiddenKernelArgs( Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr()); if (HiddenArgNumBytes >= 8) - emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, - Args); + emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, + Args); if (HiddenArgNumBytes >= 16) - emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, - Args); + emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, + Args); if (HiddenArgNumBytes >= 24) - emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, - Args); + emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, + Args); auto *Int8PtrTy = PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); @@ -445,42 +459,42 @@ void MetadataStreamerMsgPackV4::emitHiddenKernelArgs( // before code object V5, which makes the mutual exclusion between the // "printf buffer" and "hostcall buffer" here sound. if (M->getNamedMetadata("llvm.printf.fmts")) - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, - Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, + Args); else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset, - Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", + Offset, Args); else - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); } // Emit "default queue" and "completion action" arguments if enqueue kernel is // used, otherwise emit dummy "none" arguments. if (HiddenArgNumBytes >= 40) { if (!Func.hasFnAttribute("amdgpu-no-default-queue")) { - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset, - Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset, + Args); } else { - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); } } if (HiddenArgNumBytes >= 48) { if (!Func.hasFnAttribute("amdgpu-no-completion-action")) { - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset, - Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_completion_action", + Offset, Args); } else { - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); } } // Emit the pointer argument for multi-grid object. if (HiddenArgNumBytes >= 56) { if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) { - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, - Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", + Offset, Args); } else { - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); } } } @@ -617,6 +631,13 @@ void MetadataStreamerMsgPackV5::emitVersion() { getRootMetadata("amdhsa.version") = Version; } +void MetadataStreamerMsgPackV5::emitHiddenKernelArg( + const DataLayout &DL, Type *ArgTy, Align Alignment, StringRef ArgName, + unsigned &Offset, msgpack::ArrayDocNode Args, + KernArgPreload::HiddenArg HiddenArg, const AMDGPUFunctionArgInfo *ArgInfo) { + emitKernelArgImpl(DL, ArgTy, Alignment, ArgName, Offset, Args); +} + void MetadataStreamerMsgPackV5::emitHiddenKernelArgs( const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) { auto &Func = MF.getFunction(); @@ -635,77 +656,90 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs( auto *Int16Ty = Type::getInt16Ty(Func.getContext()); Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr()); - emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, Args); - emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset, Args); - emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, Args); - - emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_x", Offset, Args); - emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_y", Offset, Args); - emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_z", Offset, Args); - - emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_x", Offset, Args); - emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_y", Offset, Args); - emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_z", Offset, Args); + const AMDGPUFunctionArgInfo &ArgInfo = MFI.getArgInfo(); + emitHiddenKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, + Args, KernArgPreload::HIDDEN_BLOCK_COUNT_X, &ArgInfo); + emitHiddenKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset, + Args, KernArgPreload::HIDDEN_BLOCK_COUNT_Y, &ArgInfo); + emitHiddenKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, + Args, KernArgPreload::HIDDEN_BLOCK_COUNT_Z, &ArgInfo); + + emitHiddenKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_x", Offset, + Args, KernArgPreload::HIDDEN_GROUP_SIZE_X, &ArgInfo); + emitHiddenKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_y", Offset, + Args, KernArgPreload::HIDDEN_GROUP_SIZE_Y, &ArgInfo); + emitHiddenKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_z", Offset, + Args, KernArgPreload::HIDDEN_GROUP_SIZE_Z, &ArgInfo); + + emitHiddenKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_x", Offset, Args, + KernArgPreload::HIDDEN_REMAINDER_X, &ArgInfo); + emitHiddenKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_y", Offset, Args, + KernArgPreload::HIDDEN_REMAINDER_Y, &ArgInfo); + emitHiddenKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_z", Offset, Args, + KernArgPreload::HIDDEN_REMAINDER_Z, &ArgInfo); // Reserved for hidden_tool_correlation_id. Offset += 8; Offset += 8; // Reserved. - emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, Args); - emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, Args); - emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, Args); + emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, + Args); + emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, + Args); + emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, + Args); - emitKernelArg(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args); + emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args); Offset += 6; // Reserved. auto *Int8PtrTy = PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); if (M->getNamedMetadata("llvm.printf.fmts")) { - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, - Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, + Args); } else { Offset += 8; // Skipped. } if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) { - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset, - Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset, + Args); } else { Offset += 8; // Skipped. } if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) { - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, - Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", + Offset, Args); } else { Offset += 8; // Skipped. } if (!Func.hasFnAttribute("amdgpu-no-heap-ptr")) - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args); else Offset += 8; // Skipped. if (!Func.hasFnAttribute("amdgpu-no-default-queue")) { - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset, - Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset, + Args); } else { Offset += 8; // Skipped. } if (!Func.hasFnAttribute("amdgpu-no-completion-action")) { - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset, - Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_completion_action", + Offset, Args); } else { Offset += 8; // Skipped. } // Emit argument for hidden dynamic lds size if (MFI.isDynamicLDSUsed()) { - emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset, - Args); + emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset, + Args); } else { Offset += 4; // skipped } @@ -715,14 +749,17 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs( // hidden_private_base and hidden_shared_base are only when the subtarget has // ApertureRegs. if (!ST.hasApertureRegs()) { - emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args); - emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args); + emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_private_base", Offset, + Args); + emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, + Args); } else { Offset += 8; // Skipped. } if (MFI.getUserSGPRInfo().hasQueuePtr()) - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args); + emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, + Args); } void MetadataStreamerMsgPackV5::emitKernelAttrs(const AMDGPUTargetMachine &TM, @@ -745,5 +782,51 @@ void MetadataStreamerMsgPackV6::emitVersion() { getRootMetadata("amdhsa.version") = Version; } +void MetadataStreamerMsgPackV6::emitHiddenKernelArg( + const DataLayout &DL, Type *ArgTy, Align Alignment, StringRef ArgName, + unsigned &Offset, msgpack::ArrayDocNode Args, + KernArgPreload::HiddenArg HiddenArg, const AMDGPUFunctionArgInfo *ArgInfo) { + assert(ArgInfo && HiddenArg != KernArgPreload::END_HIDDEN_ARGS); + + SmallString<32> PreloadStr; + const KernArgPreload::KernArgPreloadDescriptor *PreloadDesc = + ArgInfo->getHiddenArgPreloadDescriptor(HiddenArg); + if (PreloadDesc) { + const SmallVectorImpl &Regs = PreloadDesc->Regs; + for (const auto Reg : Regs) { + if (!PreloadStr.empty()) + PreloadStr.push_back(' '); + PreloadStr += AMDGPUInstPrinter::getRegisterName(Reg); + } + } + emitKernelArgImpl(DL, ArgTy, Alignment, ArgName, Offset, Args, PreloadStr); +} + +void MetadataStreamerMsgPackV6::emitKernelArg(const Argument &Arg, + unsigned &Offset, + msgpack::ArrayDocNode Args, + const MachineFunction &MF) { + const SIMachineFunctionInfo *MFI = MF.getInfo(); + SmallString<32> PreloadRegisters; + if (MFI->getNumKernargPreloadedSGPRs()) { + assert(MF.getSubtarget().hasKernargPreload()); + const SmallVectorImpl + &PreloadDescs = + MFI->getArgInfo().getPreloadDescriptorsForArgIdx(Arg.getArgNo()); + for (auto &Desc : PreloadDescs) { + if (!PreloadRegisters.empty()) + PreloadRegisters.push_back(' '); + + for (const auto Reg : Desc->Regs) { + if (!PreloadRegisters.empty()) + PreloadRegisters.push_back(' '); + PreloadRegisters += AMDGPUInstPrinter::getRegisterName(Reg); + } + } + } + + emitKernelArgCommon(Arg, Offset, Args, MF, PreloadRegisters); +} + } // end namespace AMDGPU::HSAMD } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 22dfcb4a4ec1d..0515482790b6b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H +#include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUDelayedMCExpr.h" #include "llvm/BinaryFormat/MsgPackDocument.h" #include "llvm/Support/AMDGPUMetadata.h" @@ -60,6 +61,9 @@ class MetadataStreamer { virtual void emitVersion() = 0; virtual void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) = 0; + virtual void emitKernelArg(const Argument &Arg, unsigned &Offset, + msgpack::ArrayDocNode Args, + const MachineFunction &MF) = 0; virtual void emitKernelAttrs(const AMDGPUTargetMachine &TM, const Function &Func, msgpack::MapDocNode Kern) = 0; @@ -108,15 +112,22 @@ class LLVM_EXTERNAL_VISIBILITY MetadataStreamerMsgPackV4 void emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern); void emitKernelArg(const Argument &Arg, unsigned &Offset, - msgpack::ArrayDocNode Args); - - void emitKernelArg(const DataLayout &DL, Type *Ty, Align Alignment, - StringRef ValueKind, unsigned &Offset, msgpack::ArrayDocNode Args, - MaybeAlign PointeeAlign = std::nullopt, - StringRef Name = "", StringRef TypeName = "", - StringRef BaseTypeName = "", StringRef ActAccQual = "", - StringRef AccQual = "", StringRef TypeQual = ""); + const MachineFunction &MF) override; + + void emitKernelArgCommon(const Argument &Arg, unsigned &Offset, + msgpack::ArrayDocNode Args, + const MachineFunction &MF, + StringRef PreloadRegisters = {}); + + void emitKernelArgImpl(const DataLayout &DL, Type *Ty, Align Alignment, + StringRef ValueKind, unsigned &Offset, + msgpack::ArrayDocNode Args, + StringRef PreloadRegisters = "", + MaybeAlign PointeeAlign = std::nullopt, + StringRef Name = "", StringRef TypeName = "", + StringRef BaseTypeName = "", StringRef ActAccQual = "", + StringRef AccQual = "", StringRef TypeQual = ""); void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) override; @@ -151,6 +162,11 @@ class MetadataStreamerMsgPackV5 : public MetadataStreamerMsgPackV4 { msgpack::ArrayDocNode Args) override; void emitKernelAttrs(const AMDGPUTargetMachine &TM, const Function &Func, msgpack::MapDocNode Kern) override; + virtual void emitHiddenKernelArg( + const DataLayout &DL, Type *ArgTy, Align Alignment, StringRef ArgName, + unsigned &Offset, msgpack::ArrayDocNode Args, + KernArgPreload::HiddenArg HiddenArg = KernArgPreload::END_HIDDEN_ARGS, + const AMDGPUFunctionArgInfo *ArgInfo = nullptr); public: MetadataStreamerMsgPackV5() = default; @@ -160,6 +176,14 @@ class MetadataStreamerMsgPackV5 : public MetadataStreamerMsgPackV4 { class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 { protected: void emitVersion() override; + void emitKernelArg(const Argument &Arg, unsigned &Offset, + msgpack::ArrayDocNode Args, + const MachineFunction &MF) override; + void emitHiddenKernelArg( + const DataLayout &DL, Type *ArgTy, Align Alignment, StringRef ArgName, + unsigned &Offset, msgpack::ArrayDocNode Args, + KernArgPreload::HiddenArg HiddenArg = KernArgPreload::END_HIDDEN_ARGS, + const AMDGPUFunctionArgInfo *ArgInfo = nullptr) override; public: MetadataStreamerMsgPackV6() = default; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index dec781d71c54e..5df85a8803821 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -24,6 +24,7 @@ #define DEBUG_TYPE "amdgpu-lower-kernel-arguments" using namespace llvm; +using namespace llvm::KernArgPreload; namespace { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp index 984c1ee89309e..46af53c2ff0d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp @@ -19,6 +19,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUTargetMachine.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" @@ -32,6 +33,7 @@ #define DEBUG_TYPE "amdgpu-preload-kernel-arguments" using namespace llvm; +using namespace llvm::KernArgPreload; static cl::opt KernargPreloadCount( "amdgpu-kernarg-preload-count", @@ -60,59 +62,6 @@ class PreloadKernelArgInfo { const GCNSubtarget &ST; unsigned NumFreeUserSGPRs; - enum HiddenArg : unsigned { - HIDDEN_BLOCK_COUNT_X, - HIDDEN_BLOCK_COUNT_Y, - HIDDEN_BLOCK_COUNT_Z, - HIDDEN_GROUP_SIZE_X, - HIDDEN_GROUP_SIZE_Y, - HIDDEN_GROUP_SIZE_Z, - HIDDEN_REMAINDER_X, - HIDDEN_REMAINDER_Y, - HIDDEN_REMAINDER_Z, - END_HIDDEN_ARGS - }; - - // Stores information about a specific hidden argument. - struct HiddenArgInfo { - // Offset in bytes from the location in the kernearg segment pointed to by - // the implicitarg pointer. - uint8_t Offset; - // The size of the hidden argument in bytes. - uint8_t Size; - // The name of the hidden argument in the kernel signature. - const char *Name; - }; - - static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = { - {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"}, - {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"}, - {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"}, - {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"}, - {22, 2, "_hidden_remainder_z"}}; - - static HiddenArg getHiddenArgFromOffset(unsigned Offset) { - for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I) - if (HiddenArgs[I].Offset == Offset) - return static_cast(I); - - return END_HIDDEN_ARGS; - } - - static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) { - if (HA < END_HIDDEN_ARGS) - return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8); - - llvm_unreachable("Unexpected hidden argument."); - } - - static const char *getHiddenArgName(HiddenArg HA) { - if (HA < END_HIDDEN_ARGS) - return HiddenArgs[HA].Name; - - llvm_unreachable("Unexpected hidden argument."); - } - // Clones the function after adding implicit arguments to the argument list // and returns the new updated function. Preloaded implicit arguments are // added up to and including the last one that will be preloaded, indicated by @@ -125,7 +74,7 @@ class PreloadKernelArgInfo { LLVMContext &Ctx = F.getParent()->getContext(); SmallVector FTypes(FT->param_begin(), FT->param_end()); for (unsigned I = 0; I <= LastPreloadIndex; ++I) - FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I))); + FTypes.push_back(HiddenArgUtils::getHiddenArgType(Ctx, HiddenArg(I))); FunctionType *NFT = FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg()); @@ -152,7 +101,7 @@ class PreloadKernelArgInfo { AttributeList AL = NF->getAttributes(); for (unsigned I = 0; I <= LastPreloadIndex; ++I) { AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB); - NFArg++->setName(getHiddenArgName(HiddenArg(I))); + NFArg++->setName(HiddenArgUtils::getHiddenArgName(HiddenArg(I))); } NF->setAttributes(AL); @@ -210,8 +159,9 @@ class PreloadKernelArgInfo { // FIXME: Expand handle merged loads. LLVMContext &Ctx = F.getParent()->getContext(); Type *LoadTy = Load->getType(); - HiddenArg HA = getHiddenArgFromOffset(Offset); - if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA)) + HiddenArg HA = HiddenArgUtils::getHiddenArgFromOffset(Offset); + if (HA == END_HIDDEN_ARGS || + LoadTy != HiddenArgUtils::getHiddenArgType(Ctx, HA)) continue; ImplicitArgLoads.push_back(std::make_pair(Load, Offset)); @@ -242,14 +192,16 @@ class PreloadKernelArgInfo { if (PreloadEnd == ImplicitArgLoads.begin()) return; - unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second); + unsigned LastHiddenArgIndex = + HiddenArgUtils::getHiddenArgFromOffset(PreloadEnd[-1].second); Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex); assert(NF); FunctionsToErase.push_back(&F); for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) { LoadInst *LoadInst = I->first; unsigned LoadOffset = I->second; - unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset); + unsigned HiddenArgIndex = + HiddenArgUtils::getHiddenArgFromOffset(LoadOffset); unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1; Argument *Arg = NF->getArg(Index); LoadInst->replaceAllUsesWith(Arg); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4d67e4a5cbcf9..6ba57565d6a42 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -48,6 +48,7 @@ using namespace llvm; using namespace llvm::SDPatternMatch; +using namespace llvm::KernArgPreload; #define DEBUG_TYPE "si-lower" @@ -2680,6 +2681,18 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, // these from the dispatch pointer. } +// Maps a hidden kernel argument to its preload index in +// PreloadHiddenArgsIndexMap. +static void mapHiddenArgToPreloadIndex(AMDGPUFunctionArgInfo &ArgInfo, + unsigned ArgOffset, + unsigned ImplicitArgOffset, + unsigned ArgIdx) { + auto [It, Inserted] = ArgInfo.PreloadHiddenArgsIndexMap.try_emplace( + HiddenArgUtils::getHiddenArgFromOffset(ArgOffset - ImplicitArgOffset)); + assert(Inserted && "Preload hidden kernel argument allocated twice."); + It->second = ArgIdx; +} + // Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be // sequential starting from the first argument. void SITargetLowering::allocatePreloadKernArgSGPRs( @@ -2692,6 +2705,7 @@ void SITargetLowering::allocatePreloadKernArgSGPRs( bool InPreloadSequence = true; unsigned InIdx = 0; bool AlignedForImplictArgs = false; + unsigned ImplicitArgOffsetAdjustment = 0; unsigned ImplicitArgOffset = 0; for (auto &Arg : F.args()) { if (!InPreloadSequence || !Arg.hasInRegAttr()) @@ -2720,18 +2734,35 @@ void SITargetLowering::allocatePreloadKernArgSGPRs( if (!AlignedForImplictArgs) { ImplicitArgOffset = alignTo(LastExplicitArgOffset, - Subtarget->getAlignmentForImplicitArgPtr()) - - LastExplicitArgOffset; + Subtarget->getAlignmentForImplicitArgPtr()); + ImplicitArgOffsetAdjustment = + ImplicitArgOffset - LastExplicitArgOffset; AlignedForImplictArgs = true; } - ArgOffset += ImplicitArgOffset; + ArgOffset += ImplicitArgOffsetAdjustment; } // Arg is preloaded into the previous SGPR. if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) { assert(InIdx >= 1 && "No previous SGPR"); - Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back( - Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]); + auto &PreloadKernArgs = Info.getArgInfo().PreloadKernArgs; + PreloadKernArgs.grow(InIdx); + KernArgPreloadDescriptor &PreloadDesc = PreloadKernArgs[InIdx]; + assert(!PreloadDesc.IsValid && + "Preload kernel argument allocated twice."); + + const KernArgPreloadDescriptor &PrevDesc = PreloadKernArgs[InIdx - 1]; + assert(PrevDesc.IsValid && + "Previous preload kernel argument not allocated."); + PreloadDesc.Regs.push_back(PrevDesc.Regs[0]); + + PreloadDesc.OrigArgIdx = Arg.getArgNo(); + PreloadDesc.PartIdx = InIdx; + PreloadDesc.IsValid = true; + if (Arg.hasAttribute("amdgpu-hidden-argument")) + mapHiddenArgToPreloadIndex(Info.getArgInfo(), ArgOffset, + ImplicitArgOffset, InIdx); + continue; } @@ -2743,11 +2774,15 @@ void SITargetLowering::allocatePreloadKernArgSGPRs( break; } + if (Arg.hasAttribute("amdgpu-hidden-argument")) + mapHiddenArgToPreloadIndex(Info.getArgInfo(), ArgOffset, + ImplicitArgOffset, InIdx); + // Preload this argument. const TargetRegisterClass *RC = TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32); - SmallVectorImpl *PreloadRegs = - Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs); + SmallVectorImpl *PreloadRegs = Info.addPreloadedKernArg( + TRI, RC, NumAllocSGPRs, InIdx, Arg.getArgNo(), PaddingSGPRs); if (PreloadRegs->size() > 1) RC = &AMDGPU::SGPR_32RegClass; @@ -3151,7 +3186,9 @@ SDValue SITargetLowering::LowerFormalArguments( } SDValue NewArg; - if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) { + auto &PreloadKernArgs = Info->getArgInfo().PreloadKernArgs; + if (Arg.isOrigArg() && PreloadKernArgs.inBounds(i) && + PreloadKernArgs[i].IsValid) { if (MemVT.getStoreSize() < 4 && Alignment < 4) { // In this case the argument is packed into the previous preload SGPR. int64_t AlignDownOffset = alignDown(Offset, 4); @@ -3161,8 +3198,7 @@ SDValue SITargetLowering::LowerFormalArguments( const SIMachineFunctionInfo *Info = MF.getInfo(); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - Register Reg = - Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0]; + Register Reg = Info->getArgInfo().PreloadKernArgs[i].Regs[0]; assert(Reg); Register VReg = MRI.getLiveInVirtReg(Reg); @@ -3182,7 +3218,7 @@ SDValue SITargetLowering::LowerFormalArguments( MF.getInfo(); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); const SmallVectorImpl &PreloadRegs = - Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs; + Info->getArgInfo().PreloadKernArgs[i].Regs; SDValue Copy; if (PreloadRegs.size() == 1) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 9a1448f1f95dc..7e1a6e5eab5cf 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -276,9 +276,16 @@ Register SIMachineFunctionInfo::addLDSKernelId() { SmallVectorImpl *SIMachineFunctionInfo::addPreloadedKernArg( const SIRegisterInfo &TRI, const TargetRegisterClass *RC, - unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) { - auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(KernArgIdx); - assert(Inserted && "Preload kernel argument allocated twice."); + unsigned AllocSizeDWord, unsigned PartIdx, unsigned ArgIdx, + unsigned PaddingSGPRs) { + ArgInfo.PreloadKernArgs.grow(PartIdx); + KernArgPreload::KernArgPreloadDescriptor &PreloadDesc = + ArgInfo.PreloadKernArgs[PartIdx]; + assert(!PreloadDesc.IsValid && "Preload kernel argument allocated twice."); + PreloadDesc.PartIdx = PartIdx; + PreloadDesc.OrigArgIdx = ArgIdx; + PreloadDesc.IsValid = true; + NumUserSGPRs += PaddingSGPRs; // If the available register tuples are aligned with the kernarg to be // preloaded use that register, otherwise we need to use a set of SGPRs and @@ -287,7 +294,7 @@ SmallVectorImpl *SIMachineFunctionInfo::addPreloadedKernArg( ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR(); Register PreloadReg = TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC); - auto &Regs = It->second.Regs; + auto &Regs = PreloadDesc.Regs; if (PreloadReg && (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) { Regs.push_back(PreloadReg); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 08b0206d244fb..9fae025d265d3 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -854,8 +854,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, Register addLDSKernelId(); SmallVectorImpl * addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC, - unsigned AllocSizeDWord, int KernArgIdx, - int PaddingSGPRs); + unsigned AllocSizeDWord, unsigned PartIdx, + unsigned ArgIdx, unsigned PaddingSGPRs); /// Increment user SGPRs used for padding the argument list only. Register addReservedUserSGPR() { diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-preload-args-v6.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-preload-args-v6.ll new file mode 100644 index 0000000000000..ce038d8c93418 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-preload-args-v6.ll @@ -0,0 +1,460 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=16 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=16 < %s | FileCheck --check-prefix=CHECK %s + +; CHECK: amdhsa.kernels: +; CHECK-NEXT: - .agpr_count: 0 +; CHECK-NEXT: .args: +; CHECK-NEXT: - .name: in +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .preload_registers: s8 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: r +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .preload_registers: 's[10:11]' +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .preload_registers: 's[12:13]' +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: b +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .preload_registers: 's[14:15]' +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .offset: 32 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 36 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 40 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK-NEXT: - .offset: 44 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_x +; CHECK-NEXT: - .offset: 46 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_y +; CHECK-NEXT: - .offset: 48 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_z +; CHECK-NEXT: - .offset: 50 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_x +; CHECK-NEXT: - .offset: 52 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_y +; CHECK-NEXT: - .offset: 54 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_z +; CHECK-NEXT: - .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 88 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .offset: 96 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_grid_dims +; CHECK-NEXT: - .offset: 104 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: - .offset: 112 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: - .offset: 120 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: - .offset: 128 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_heap_v1 +; CHECK-NEXT: - .offset: 136 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_default_queue +; CHECK-NEXT: - .offset: 144 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_completion_action +; CHECK-NEXT: - .offset: 152 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size +; CHECK-NEXT: - .offset: 232 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_queue_ptr +; CHECK-NEXT: .group_segment_fixed_size: 0 +; CHECK-NEXT: .kernarg_segment_align: 8 +; CHECK-NEXT: .kernarg_segment_size: 288 +; CHECK-NEXT: .max_flat_workgroup_size: 1024 +; CHECK-NEXT: .name: test_preload_v6 +; CHECK-NEXT: .private_segment_fixed_size: 0 +; CHECK-NEXT: .sgpr_count: 22 +; CHECK-NEXT: .sgpr_spill_count: 0 +; CHECK-NEXT: .symbol: test_preload_v6.kd +; CHECK-NEXT: .uses_dynamic_stack: false +; CHECK-NEXT: .vgpr_count: 3 +; CHECK-NEXT: .vgpr_spill_count: 0 +; CHECK-NEXT: .wavefront_size: 64 +; CHECK-NEXT: - .agpr_count: 0 +; CHECK-NEXT: .args: +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: out +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .preload_registers: 's[2:3]' +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .offset: 8 +; CHECK-NEXT: .preload_registers: s4 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 12 +; CHECK-NEXT: .preload_registers: s5 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .preload_registers: s6 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK-NEXT: - .offset: 20 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_x +; CHECK-NEXT: - .offset: 22 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_y +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_z +; CHECK-NEXT: - .offset: 26 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_x +; CHECK-NEXT: - .offset: 28 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_y +; CHECK-NEXT: - .offset: 30 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_z +; CHECK-NEXT: - .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 64 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .offset: 72 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_grid_dims +; CHECK-NEXT: - .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: .group_segment_fixed_size: 0 +; CHECK-NEXT: .kernarg_segment_align: 8 +; CHECK-NEXT: .kernarg_segment_size: 264 +; CHECK-NEXT: .max_flat_workgroup_size: 1024 +; CHECK-NEXT: .name: test_preload_v6_block_count_xyz +; CHECK-NEXT: .private_segment_fixed_size: 0 +; CHECK-NEXT: .sgpr_count: 13 +; CHECK-NEXT: .sgpr_spill_count: 0 +; CHECK-NEXT: .symbol: test_preload_v6_block_count_xyz.kd +; CHECK-NEXT: .uses_dynamic_stack: false +; CHECK-NEXT: .vgpr_count: 4 +; CHECK-NEXT: .vgpr_spill_count: 0 +; CHECK-NEXT: .wavefront_size: 64 +; CHECK-NEXT: - .agpr_count: 0 +; CHECK-NEXT: .args: +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: out +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .preload_registers: 's[2:3]' +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .offset: 8 +; CHECK-NEXT: .preload_registers: s4 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 12 +; CHECK-NEXT: .preload_registers: s5 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .preload_registers: s6 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK-NEXT: - .offset: 20 +; CHECK-NEXT: .preload_registers: s7 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_x +; CHECK-NEXT: - .offset: 22 +; CHECK-NEXT: .preload_registers: s7 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_y +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .preload_registers: s8 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_z +; CHECK-NEXT: - .offset: 26 +; CHECK-NEXT: .preload_registers: s8 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_x +; CHECK-NEXT: - .offset: 28 +; CHECK-NEXT: .preload_registers: s9 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_y +; CHECK-NEXT: - .offset: 30 +; CHECK-NEXT: .preload_registers: s9 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_z +; CHECK-NEXT: - .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 64 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .offset: 72 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_grid_dims +; CHECK-NEXT: - .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: .group_segment_fixed_size: 0 +; CHECK-NEXT: .kernarg_segment_align: 8 +; CHECK-NEXT: .kernarg_segment_size: 264 +; CHECK-NEXT: .max_flat_workgroup_size: 1024 +; CHECK-NEXT: .name: test_preload_v6_block_count_z_workgroup_size_z_remainder_z +; CHECK-NEXT: .private_segment_fixed_size: 0 +; CHECK-NEXT: .sgpr_count: 16 +; CHECK-NEXT: .sgpr_spill_count: 0 +; CHECK-NEXT: .symbol: test_preload_v6_block_count_z_workgroup_size_z_remainder_z.kd +; CHECK-NEXT: .uses_dynamic_stack: false +; CHECK-NEXT: .vgpr_count: 4 +; CHECK-NEXT: .vgpr_spill_count: 0 +; CHECK-NEXT: .wavefront_size: 64 +; CHECK-NEXT: - .agpr_count: 0 +; CHECK-NEXT: .args: +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: out +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .preload_registers: 's[2:3]' +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .name: arg0 +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .preload_registers: s4 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: - .name: arg1 +; CHECK-NEXT: .offset: 10 +; CHECK-NEXT: .preload_registers: s4 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 20 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK-NEXT: - .offset: 28 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_x +; CHECK-NEXT: - .offset: 30 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_y +; CHECK-NEXT: - .offset: 32 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_z +; CHECK-NEXT: - .offset: 34 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_x +; CHECK-NEXT: - .offset: 36 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_y +; CHECK-NEXT: - .offset: 38 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_z +; CHECK-NEXT: - .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 64 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .offset: 80 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_grid_dims +; CHECK-NEXT: - .offset: 88 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: .group_segment_fixed_size: 0 +; CHECK-NEXT: .kernarg_segment_align: 8 +; CHECK-NEXT: .kernarg_segment_size: 272 +; CHECK-NEXT: .max_flat_workgroup_size: 1024 +; CHECK-NEXT: .name: test_prelaod_v6_ptr1_i16_i16 +; CHECK-NEXT: .private_segment_fixed_size: 0 +; CHECK-NEXT: .sgpr_count: 11 +; CHECK-NEXT: .sgpr_spill_count: 0 +; CHECK-NEXT: .symbol: test_prelaod_v6_ptr1_i16_i16.kd +; CHECK-NEXT: .uses_dynamic_stack: false +; CHECK-NEXT: .vgpr_count: 2 +; CHECK-NEXT: .vgpr_spill_count: 0 +; CHECK-NEXT: .wavefront_size: 64 +; CHECK-NEXT: - .agpr_count: 0 +; CHECK-NEXT: .args: +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: out +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .preload_registers: 's[2:3]' +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .name: arg0 +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .preload_registers: s6 s7 s8 s9 +; CHECK-NEXT: .size: 16 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: - .offset: 32 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 36 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 40 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK-NEXT: - .offset: 44 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_x +; CHECK-NEXT: - .offset: 46 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_y +; CHECK-NEXT: - .offset: 48 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_z +; CHECK-NEXT: - .offset: 50 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_x +; CHECK-NEXT: - .offset: 52 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_y +; CHECK-NEXT: - .offset: 54 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_z +; CHECK-NEXT: - .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 88 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .offset: 96 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_grid_dims +; CHECK-NEXT: - .offset: 104 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: .group_segment_fixed_size: 0 +; CHECK-NEXT: .kernarg_segment_align: 16 +; CHECK-NEXT: .kernarg_segment_size: 288 +; CHECK-NEXT: .max_flat_workgroup_size: 1024 +; CHECK-NEXT: .name: test_prelaod_v6_ptr1_v8i16 +; CHECK-NEXT: .private_segment_fixed_size: 0 +; CHECK-NEXT: .sgpr_count: 16 +; CHECK-NEXT: .sgpr_spill_count: 0 +; CHECK-NEXT: .symbol: test_prelaod_v6_ptr1_v8i16.kd +; CHECK-NEXT: .uses_dynamic_stack: false +; CHECK-NEXT: .vgpr_count: 5 +; CHECK-NEXT: .vgpr_spill_count: 0 +; CHECK-NEXT: .wavefront_size: 64 +; CHECK-NEXT: amdhsa.printf: +; CHECK-NEXT: - '1:1:4:%d\n' +; CHECK-NEXT: - '2:1:8:%g\n' +; CHECK-NEXT: amdhsa.target: amdgcn-amd-amdhsa--gfx942 +; CHECK-NEXT: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 3 + +@lds = external hidden addrspace(3) global [0 x i32], align 4 + +define amdgpu_kernel void @test_preload_v6( + i32 inreg %in, + ptr addrspace(1) inreg %r, + ptr addrspace(1) inreg %a, + ptr addrspace(1) inreg %b) #0 { + %a.val = load half, ptr addrspace(1) %a + %b.val = load half, ptr addrspace(1) %b + %r.val = fadd half %a.val, %b.val + store half %r.val, ptr addrspace(1) %r + store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4 + ret void +} + +define amdgpu_kernel void @test_preload_v6_block_count_xyz(ptr addrspace(1) inreg %out) #1 { + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0 + %load_x = load i32, ptr addrspace(4) %gep_x + %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 + %load_y = load i32, ptr addrspace(4) %gep_y + %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 + %load_z = load i32, ptr addrspace(4) %gep_z + %ins.0 = insertelement <3 x i32> poison, i32 %load_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %load_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %load_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_preload_v6_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #1 { + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 + %gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 + %gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load0 = load i32, ptr addrspace(4) %gep0 + %load1 = load i16, ptr addrspace(4) %gep1 + %load2 = load i16, ptr addrspace(4) %gep2 + %conv1 = zext i16 %load1 to i32 + %conv2 = zext i16 %load2 to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_prelaod_v6_ptr1_i16_i16(ptr addrspace(1) inreg %out, i16 inreg %arg0, i16 inreg %arg1) #1 { + %ext = zext i16 %arg0 to i32 + %ext1 = zext i16 %arg1 to i32 + %add = add i32 %ext, %ext1 + store i32 %add, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_prelaod_v6_ptr1_v8i16(ptr addrspace(1) inreg %out, <8 x i16> inreg %arg0) #1 { + store <8 x i16> %arg0, ptr addrspace(1) %out, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdhsa_code_object_version", i32 600} +!llvm.printf.fmts = !{!1, !2} +!1 = !{!"1:1:4:%d\5Cn"} +!2 = !{!"2:1:8:%g\5Cn"} + +attributes #0 = { optnone noinline } +attributes #1 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll index 560b0e2c81cf2..0a5a7f92e41d8 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -15,7 +15,8 @@ ; ASM: amdhsa.version: ; ASM: - 1 ; ASM4: - 1 -; ASM56: - 2 +; ASM5: - 2 +; ASM6: - 3 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll index 0741ec4ffac42..3eb08bf75978b 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM6 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -15,7 +15,8 @@ ; ASM: amdhsa.version: ; ASM: - 1 ; ASM4: - 1 -; ASM56: - 2 +; ASM5: - 2 +; ASM6: - 3 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll index 08dd90250d0b4..600ef7b39d353 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -15,7 +15,8 @@ ; ASM: amdhsa.version: ; ASM: - 1 ; ASM4: - 1 -; ASM56: - 2 +; ASM5: - 2 +; ASM6: - 3 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll index a8340ddadaaf7..d7e9650ede5e8 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -15,7 +15,8 @@ ; ASM: amdhsa.version: ; ASM: - 1 ; ASM4: - 1 -; ASM56: - 2 +; ASM5: - 2 +; ASM6: - 3 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll index aefcfac23ff5d..230a54201b887 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -15,7 +15,8 @@ ; ASM: amdhsa.version: ; ASM: - 1 ; ASM4: - 1 -; ASM56: - 2 +; ASM5: - 2 +; ASM6: - 3 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll index 6005c31622405..c3b5e43160e05 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -15,7 +15,8 @@ ; ASM: amdhsa.version: ; ASM: - 1 ; ASM4: - 1 -; ASM56: - 2 +; ASM5: - 2 +; ASM6: - 3 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll index 328f56fb841b8..b3163b95c9110 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -15,7 +15,8 @@ ; ASM: amdhsa.version: ; ASM: - 1 ; ASM4: - 1 -; ASM56: - 2 +; ASM5: - 2 +; ASM6: - 3 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll index c50dd8b2fec7a..064d45a81c1c5 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -15,7 +15,8 @@ ; ASM: amdhsa.version: ; ASM: - 1 ; ASM4: - 1 -; ASM56: - 2 +; ASM5: - 2 +; ASM6: - 3 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll index fed493b630a4d..5043b94be58c2 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM6 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -15,7 +15,8 @@ ; ASM: amdhsa.version: ; ASM: - 1 ; ASM4: - 1 -; ASM56: - 2 +; ASM5: - 2 +; ASM6: - 3 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll index 60ff8b2dbb5eb..5936eaabdf890 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -15,7 +15,8 @@ ; ASM: amdhsa.version: ; ASM: - 1 ; ASM4: - 1 -; ASM56: - 2 +; ASM5: - 2 +; ASM6: - 3 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll index e04629a24209e..fe87f211be649 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -15,7 +15,8 @@ ; ASM: amdhsa.version: ; ASM: - 1 ; ASM4: - 1 -; ASM56: - 2 +; ASM5: - 2 +; ASM6: - 3 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2