diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4ff761ec19b3c..06aa8c4ad06b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -51,6 +51,7 @@ FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIPreAllocateWWMRegsLegacyPass();
 FunctionPass *createSIFormMemoryClausesLegacyPass();
+FunctionPass *createAMDGPUConvertWaveSizeLegacyPass(const GCNTargetMachine *);
 FunctionPass *createSIPostRABundlerPass();
 FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
 
@@ -174,6 +175,9 @@ extern char &SIShrinkInstructionsLegacyID;
 void initializeSIFixSGPRCopiesLegacyPass(PassRegistry &);
 extern char &SIFixSGPRCopiesLegacyID;
 
+void initializeAMDGPUConvertWaveSizeLegacyPass(PassRegistry &);
+extern char &AMDGPUConvertWaveSizeLegacyID;
+
 void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &);
 extern char &SIFixVGPRCopiesID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp
new file mode 100644
index 0000000000000..1ee86c437610d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp
@@ -0,0 +1,312 @@
+//===- AMDGPUConvertWaveSize.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 WITH LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Automatically converts wave32 kernels to wave64.
+///
+/// Small, short-lived kernels may become wave-slot limited. To work around
+/// the problem, this optimization converts such kernels from wave32 to
+/// wave64 automatically. These kernels must conform to a strict set of
+/// limitations and satisfy profitability conditions:
+///
+/// 1. A kernel shall have no function calls, as we cannot analyze call stack
+///    requirements (nor would such a kernel fall into the category of
+///    short-lived kernels anyway).
+/// 2. A kernel itself shall not be called from a device enqueue call.
+/// 3. A kernel shall not attempt to access EXEC or VCC in any user-visible
+///    way.
+/// 4. A kernel must not use readlane/readfirstlane or any cross-lane/DPP
+///    operations in general.
+/// 5. A kernel shall not read the wavefront size or use ballot through
+///    intrinsics (a use of the pre-defined frontend wave size macro was
+///    deemed permissible for now).
+/// 6. There shall be no atomic operations of any sort, as these may be used
+///    for cross-thread communication.
+/// 7. There shall be no LDS access, as the allocation is usually tied to the
+///    workgroup size and we generally cannot extend it. It also changes
+///    occupancy, which is tied to the wave size.
+/// 8. There shall be no inline asm calls.
+/// 9. There shall be no dynamic VGPRs.
+/// 10. Starting from GFX11, some instructions (such as WMMA on GFX11+ and
+///     transpose loads on GFX12+) work differently (have different operands)
+///     in wave32 and wave64. The kernel shall not use intrinsics that invoke
+///     such instructions.
+
+#include "AMDGPUConvertWaveSize.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-convert-wave-size"
+
+namespace {
+class AMDGPUConvertWaveSize {
+  const GCNTargetMachine *TM;
+  const LoopInfo *LI;
+  ScalarEvolution *SE;
+  TargetTransformInfo *TTI;
+
+  /// Accumulated latency estimate for the whole kernel.
+  InstructionCost TotalCost = 0;
+
+  /// Maximum estimated latency (in cycles) for which the conversion is still
+  /// considered profitable.
+  static const unsigned MaxLatency = 2000;
+
+  /// Intrinsic callees that also need their wave size attribute updated.
+  SmallVector<Function *> Callees;
+
+public:
+  AMDGPUConvertWaveSize(const GCNTargetMachine *TM, const LoopInfo *LI,
+                        ScalarEvolution *SE, TargetTransformInfo *TTI)
+      : TM(TM), LI(LI), SE(SE), TTI(TTI) {}
+
+  bool run(Function &F);
+};
+
+class AMDGPUConvertWaveSizeLegacy : public FunctionPass {
+  const GCNTargetMachine *TM;
+
+public:
+  static char ID;
+
+  AMDGPUConvertWaveSizeLegacy(const GCNTargetMachine *TM)
+      : FunctionPass(ID), TM(TM) {}
+
+  bool runOnFunction(Function &F) override {
+    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    AMDGPUConvertWaveSize Impl(TM, &LI, &SE, &TTI);
+    return Impl.run(F);
+  }
+
+  StringRef getPassName() const override { return "AMDGPU convert wave size"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.setPreservesAll();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+};
+} // end anonymous namespace
+
+void printFunctionAttributes(const Function &F) {
+  LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
+  for (const auto &Attr : F.getAttributes()) {
+    LLVM_DEBUG(dbgs() << "  Attribute: " << Attr.getAsString() << "\n");
+  }
+}
+
+bool AMDGPUConvertWaveSize::run(Function &F) {
+  // Check if the function is a kernel.
+  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+    return false;
+
+  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+  if (!ST.isWave32()) {
+    LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Kernel is not wave32.\n");
+    return false;
+  }
+
+  // No LDS objects may be passed in as kernel arguments.
+  for (const auto &Arg : F.args()) {
+    if (Arg.getType()->isPointerTy() &&
+        Arg.getType()->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Kernel argument " << Arg
+                        << " points to an LDS object\n");
+      return false;
+    }
+  }
+
+  // Check for static LDS uses.
+  const Module *M = F.getParent();
+  for (const GlobalVariable &GV : M->globals()) {
+    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+      continue;
+
+    for (const User *U : GV.users()) {
+      if (const auto *UseI = dyn_cast<Instruction>(U)) {
+        if (UseI->getFunction() == &F) {
+          LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Global variable " << GV
+                            << " points to an LDS object and is used\n");
+          return false;
+        }
+      }
+    }
+  }
+
+  // Check if the kernel can be called via device enqueue.
+  if (F.hasAddressTaken()) {
+    LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Kernel address is taken.\n");
+    return false;
+  }
+
+  // Check that the trip count is a compile time constant for all loops in
+  // the kernel. Walk all loops, not just the top-level ones, since the cost
+  // computation below relies on every enclosing loop having a constant trip
+  // count.
+  for (Loop *L : LI->getLoopsInPreorder()) {
+    const SCEV *TripCountSCEV = SE->getBackedgeTakenCount(L);
+    if (!isa<SCEVConstant>(TripCountSCEV)) {
+      LLVM_DEBUG(
+          dbgs() << "AMDGPUConvertWaveSize: Trip count is not a compile time "
+                    "constant.\n");
+      return false;
+    }
+  }
+
+  for (const auto &BB : F) {
+    InstructionCost BlockCost = 0;
+    for (const auto &I : BB) {
+      // Atomic operations are not allowed.
+      if (I.isAtomic()) {
+        LLVM_DEBUG(
+            dbgs() << "AMDGPUConvertWaveSize: Atomic operation detected.\n");
+        return false;
+      }
+
+      if (const CallBase *CB = dyn_cast<CallBase>(&I)) {
+        // FIXME: No calls are allowed. Only non-convergent intrinsic calls
+        // and amdgcn_s_barrier are exempt. InlineAsm is checked separately
+        // for debug purposes. This will be changed in the final version.
+        if (CB->isInlineAsm()) {
+          // Inline assembly is not allowed.
+          LLVM_DEBUG(dbgs()
+                     << "AMDGPUConvertWaveSize: Inline assembly detected.\n");
+          return false;
+        }
+
+        if (Function *Callee = CB->getCalledFunction()) {
+          // Assuming readlane/readfirstlane and all cross-lane/DPP
+          // operations have "let isConvergent = 1" in IntrinsicsAMDGPU.td.
+          if (Callee->isIntrinsic()) {
+            if (Callee->hasFnAttribute(Attribute::Convergent) &&
+                Callee->getIntrinsicID() != Intrinsic::amdgcn_s_barrier) {
+              // TODO: what else should go in a "white list"?
+              LLVM_DEBUG(dbgs()
+                         << "AMDGPUConvertWaveSize: Convergent intrinsic "
+                         << Callee->getName() << " detected.\n");
+              return false;
+            }
+
+            if (Callee->getIntrinsicID() == Intrinsic::read_register ||
+                Callee->getIntrinsicID() == Intrinsic::write_register) {
+              LLVM_DEBUG(dbgs()
+                         << "AMDGPUConvertWaveSize: read/write_register "
+                            "intrinsic detected.\n");
+              return false;
+            }
+
+            // Save the callee as a candidate for the attribute change.
+            Callees.push_back(Callee);
+          } else {
+            // Direct calls to ordinary functions are not allowed.
+            LLVM_DEBUG(
+                dbgs() << "AMDGPUConvertWaveSize: function call detected.\n");
+            return false;
+          }
+        } else {
+          // Indirect calls are not allowed.
+          LLVM_DEBUG(
+              dbgs() << "AMDGPUConvertWaveSize: function call detected.\n");
+          return false;
+        }
+      }
+
+      // No LDS access is allowed.
+      //
+      // We already ensured that no LDS pointers are passed as arguments.
+      // Now take care of those cast from flat or global pointers.
+
+      // Bail out early, before we come across the LDS address use.
+      if (const auto *AC = dyn_cast<AddrSpaceCastInst>(&I)) {
+        if (AC->getDestTy()->getPointerAddressSpace() ==
+            AMDGPUAS::LOCAL_ADDRESS) {
+          LLVM_DEBUG(
+              dbgs()
+              << "AMDGPUConvertWaveSize: addrspacecast to LDS detected.\n");
+          return false;
+        }
+      }
+
+      if (const auto *I2P = dyn_cast<IntToPtrInst>(&I)) {
+        if (I2P->getDestTy()->isPointerTy() &&
+            I2P->getDestTy()->getPointerAddressSpace() ==
+                AMDGPUAS::LOCAL_ADDRESS) {
+          LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: conversion of an "
+                               "integer to an LDS pointer detected.\n");
+          return false;
+        }
+      }
+
+      // TODO: Dynamic VGPRs and GFX11+ special operations.
+
+      BlockCost +=
+          TTI->getInstructionCost(&I, TargetTransformInfo::TCK_Latency);
+    }
+
+    if (auto *L = LI->getLoopFor(&BB)) {
+      const SCEV *TripCount = SE->getBackedgeTakenCount(L);
+      if (auto *C = dyn_cast<SCEVConstant>(TripCount)) {
+        uint64_t TC = C->getValue()->getZExtValue() + 1;
+        size_t Depth = LI->getLoopDepth(&BB);
+        BlockCost *= TC * Depth;
+      } else
+        llvm_unreachable("AMDGPUConvertWaveSize: only loops with compile "
+                         "time constant trip count should reach here!");
+    }
+
+    TotalCost += BlockCost;
+    if (TotalCost.isValid()) {
+      if (TotalCost.getValue().value() >= MaxLatency) {
+        LLVM_DEBUG(
+            dbgs() << "AMDGPUConvertWaveSize: Total latency of the kernel ["
+                   << TotalCost.getValue().value()
+                   << "] exceeds the limit of " << MaxLatency
+                   << " cycles - not profitable!\n");
+        return false;
+      }
+    } else
+      llvm_unreachable(
+          "AMDGPUConvertWaveSize: Cost model error - invalid state!");
+  }
+
+  // Additional checks can be added here...
+
+  // If all checks pass, convert wave size from wave32 to wave64.
+  F.addFnAttr("target-features", "+wavefrontsize64");
+  LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Converted wave size for "
+                    << F.getName() << " from wave32 to wave64.\n");
+
+  // Now take care of the intrinsic calls.
+  for (Function *C : Callees) {
+    C->addFnAttr("target-features", "+wavefrontsize64");
+    LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Converted wave size for "
+                      << C->getName() << " from wave32 to wave64.\n");
+  }
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Pass registration
+//===----------------------------------------------------------------------===//
+
+INITIALIZE_PASS_BEGIN(AMDGPUConvertWaveSizeLegacy, DEBUG_TYPE,
+                      "AMDGPU convert wave size", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUConvertWaveSizeLegacy, DEBUG_TYPE,
+                    "AMDGPU convert wave size", false, false)
+
+char AMDGPUConvertWaveSizeLegacy::ID = 0;
+
+char &llvm::AMDGPUConvertWaveSizeLegacyID = AMDGPUConvertWaveSizeLegacy::ID;
+
+FunctionPass *
+llvm::createAMDGPUConvertWaveSizeLegacyPass(const GCNTargetMachine *TM) {
+  return new AMDGPUConvertWaveSizeLegacy(TM);
+}
+
+PreservedAnalyses
+AMDGPUConvertWaveSizePass::run(Function &F, FunctionAnalysisManager &FAM) {
+  auto &LI = FAM.getResult<LoopAnalysis>(F);
+  auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
+  auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+
+  AMDGPUConvertWaveSize Impl(TM, &LI, &SE, &TTI);
+  bool Changed = Impl.run(F);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.h b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.h
new file mode 100644
index 0000000000000..e5b8c92c0b656
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.h
@@ -0,0 +1,31 @@
+//===- AMDGPUConvertWaveSize.h ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 WITH LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCONVERTWAVESIZE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCONVERTWAVESIZE_H
+
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AMDGPUConvertWaveSizePass
+    : public PassInfoMixin<AMDGPUConvertWaveSizePass> {
+  /// The target machine.
+  const GCNTargetMachine *TM;
+
+public:
+  AMDGPUConvertWaveSizePass(const GCNTargetMachine &TM) : TM(&TM) {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCONVERTWAVESIZE_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 98a1147ef6d66..b953ba8e77599 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,6 +67,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
                AMDGPUUnifyDivergentExitNodesPass())
 FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
 FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("amdgpu-convert-wave-size", AMDGPUConvertWaveSizePass(*static_cast<const GCNTargetMachine *>(this)))
 #undef FUNCTION_PASS
 
 #ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b6cc5137d711a..f2e7adebf2786 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -44,6 +44,7 @@
+#include "AMDGPUConvertWaveSize.h"
 #include "R600TargetMachine.h"
 #include "SIFixSGPRCopies.h"
 #include "SIFixVGPRCopies.h"
 #include "SIFoldOperands.h"
 #include "SIFormMemoryClauses.h"
 #include "SILoadStoreOptimizer.h"
@@ -506,6 +507,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSILowerSGPRSpillsLegacyPass(*PR);
   initializeSIFixSGPRCopiesLegacyPass(*PR);
   initializeSIFixVGPRCopiesLegacyPass(*PR);
+  initializeAMDGPUConvertWaveSizeLegacyPass(*PR);
   initializeSIFoldOperandsLegacyPass(*PR);
   initializeSIPeepholeSDWALegacyPass(*PR);
   initializeSIShrinkInstructionsLegacyPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..fccdd47151593 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -150,6 +150,7 @@ add_llvm_target(AMDGPUCodeGen
   SIAnnotateControlFlow.cpp
   SIFixSGPRCopies.cpp
   SIFixVGPRCopies.cpp
+  AMDGPUConvertWaveSize.cpp
   SIFoldOperands.cpp
   SIFormMemoryClauses.cpp
   SIFrameLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll b/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll
new file mode 100644
index 0000000000000..f43dc3235a05d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll
@@ -0,0 +1,148 @@
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-convert-wave-size < %s | FileCheck %s
+
+define amdgpu_kernel void @test_not_wave32(ptr addrspace(1) %out) #0 {
+  ; CHECK: @test_not_wave32{{.*}}) #0
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %tmp = load i32, ptr addrspace(1) %gep
+  store i32 %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @intr_non_convergent(ptr addrspace(1) nocapture %arg) #1 {
+  ; CHECK: @intr_non_convergent{{.*}} #0
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+  %tmp1 = icmp ugt i32 %tmp, 32
+  %tmp2 = select i1 %tmp1, i32 2, i32 1
+  store i32 %tmp2, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @intr_convergent(ptr addrspace(1) nocapture %arg, i32 %X) #1 {
+  ; CHECK: @intr_convergent{{.*}}) #1
+bb:
+  %tmp = icmp ugt i32 %X, 32
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %tmp)
+  store i32 %ballot, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @test_barrier(ptr addrspace(1) %in, ptr addrspace(1) %out) #1 {
+  ; CHECK: @test_barrier{{.*}}) #0
+entry:
+  %val = load <2 x half>, ptr addrspace(1) %in
+  call void @llvm.amdgcn.s.barrier() #2
+  store <2 x half> %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_read_exec(ptr addrspace(1) %out) #1 {
+  ; CHECK: @test_read_exec{{.*}}) #1
+  %exec = call i64 @llvm.read_register.i64(metadata !0)
+  store i64 %exec, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_read_vcc_lo(ptr addrspace(1) %out) #1 {
+  ; CHECK: @test_read_vcc_lo{{.*}}) #1
+  %vcc_lo = call i32 @llvm.read_register.i32(metadata !1)
+  store i32 %vcc_lo, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_read_vcc_hi(ptr addrspace(1) %out) #1 {
+  ; CHECK: @test_read_vcc_hi{{.*}}) #1
+  %vcc_hi = call i32 @llvm.read_register.i32(metadata !2)
+  store i32 %vcc_hi, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_lds_access(ptr addrspace(3) %out) #1 {
+  ; CHECK: @test_lds_access{{.*}}) #1
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 2
+  %tmp = load i32, ptr addrspace(3) %gep
+  store i32 %tmp, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_addrspacecast_to_lds(ptr addrspace(1) %in, ptr addrspace(1) %out) #1 {
+  ; CHECK: @test_addrspacecast_to_lds{{.*}}) #1
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %in, i32 16
+  %ptr = addrspacecast ptr addrspace(1) %gep to ptr addrspace(3)
+  %val = load i32, ptr addrspace(3) %ptr
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_bitcast_to_lds_ptr(ptr addrspace(1) %in, ptr addrspace(1) %out) #1 {
+  ; CHECK: @test_bitcast_to_lds_ptr{{.*}}) #1
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %in, i32 16
+  %lds = inttoptr i32 0 to ptr addrspace(3)
+  %val = load i32, ptr addrspace(3) %lds
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+@lds = addrspace(3) global [256 x i32] zeroinitializer
+
+define amdgpu_kernel void @test_use_global_lds_object(ptr addrspace(1) %out, i1 %p) #1 {
+  ; CHECK: @test_use_global_lds_object{{.*}}) #1
+  %gep = getelementptr [256 x i32], ptr addrspace(3) @lds, i32 0, i32 10
+  %ld = load i32, ptr addrspace(3) %gep
+  store i32 %ld, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_simple_loop(ptr addrspace(1) nocapture %arg) #1 {
+  ; CHECK: @test_simple_loop{{.*}}) #1
+bb:
+  br label %bb2
+
+bb1:
+  ret void
+
+bb2:
+  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
+  %tmp2 = add nuw nsw i32 %tmp1, 1
+  %tmp3 = icmp eq i32 %tmp2, 1024
+  tail call void @llvm.amdgcn.s.sleep(i32 0)
+  br i1 %tmp3, label %bb1, label %bb2
+}
+
+define amdgpu_kernel void @test_nested_loop(ptr addrspace(1) nocapture %arg) #1 {
+  ; CHECK: @test_nested_loop{{.*}}) #1
+bb:
+  br label %bb2
+
+bb1:
+  ret void
+
+bb2:
+  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb4 ]
+  %tmp2 = add nuw nsw i32 %tmp1, 1
+  %tmp3 = icmp eq i32 %tmp2, 8
+  br label %bb3
+
+bb3:
+  %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb3 ]
+  %tmp5 = add nuw nsw i32 %tmp4, 1
+  %tmp6 = icmp eq i32 %tmp5, 128
+  tail call void @llvm.amdgcn.s.sleep(i32 0)
+  br i1 %tmp6, label %bb4, label %bb3
+
+bb4:
+  br i1 %tmp3, label %bb1, label %bb2
+}
+
+declare void @llvm.amdgcn.s.sleep(i32)
+declare void @llvm.amdgcn.s.barrier()
+declare i32 @llvm.amdgcn.wavefrontsize()
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare i32 @llvm.read_register.i32(metadata)
+declare i64 @llvm.read_register.i64(metadata)
+
+attributes #0 = { nounwind "target-features"="+wavefrontsize64" }
+attributes #1 = { nounwind "target-features"="+wavefrontsize32" }
+attributes #2 = { convergent nounwind }
+
+!0 = !{!"exec"}
+!1 = !{!"vcc_lo"}
+!2 = !{!"vcc_hi"}
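Note (a sketch, not part of the patch): the positive cases above all go through the intrinsic whitelist; the simplest conversion case would be a straight-line kernel with no calls, atomics, or LDS traffic at all. Assuming the TCK_Latency cost of a handful of instructions stays well below the MaxLatency limit of 2000 cycles, such a kernel should be rewritten to attribute group #0, i.e. { nounwind "target-features"="+wavefrontsize64" }, exactly as @intr_non_convergent and @test_barrier are checked above. The kernel name below is hypothetical:

; CHECK: @test_straight_line{{.*}}) #0
define amdgpu_kernel void @test_straight_line(ptr addrspace(1) %out) #1 {
  ; A few non-atomic, non-convergent instructions: eligible and cheap,
  ; so the pass should flip the kernel from wave32 to wave64.
  %val = load i32, ptr addrspace(1) %out
  %mul = mul i32 %val, 3
  store i32 %mul, ptr addrspace(1) %out
  ret void
}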