diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 65d049ed9a0aa..a52c79f4302d1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1211,16 +1211,81 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP, llvm_unreachable("AAAMDWavesPerEU is only valid for function position"); } -static bool inlineAsmUsesAGPRs(const InlineAsm *IA) { - for (const auto &CI : IA->ParseConstraints()) { +/// Compute the minimum number of AGPRs required to allocate the inline asm. +static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA, + const CallBase &Call) { + unsigned ArgNo = 0; + unsigned ResNo = 0; + unsigned AGPRDefCount = 0; + unsigned AGPRUseCount = 0; + unsigned MaxPhysReg = 0; + const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout(); + + // TODO: Overestimates due to not accounting for tied operands + for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) { + Type *Ty = nullptr; + switch (CI.Type) { + case InlineAsm::isOutput: { + Ty = Call.getType(); + if (auto *STy = dyn_cast(Ty)) + Ty = STy->getElementType(ResNo); + ++ResNo; + break; + } + case InlineAsm::isInput: { + Ty = Call.getArgOperand(ArgNo++)->getType(); + break; + } + case InlineAsm::isLabel: + continue; + case InlineAsm::isClobber: + // Parse the physical register reference. + break; + } + for (StringRef Code : CI.Codes) { - Code.consume_front("{"); - if (Code.starts_with("a")) - return true; + unsigned RegCount = 0; + if (Code.starts_with("a")) { + // Virtual register, compute number of registers based on the type. + // + // We ought to be going through TargetLowering to get the number of + // registers, but we should avoid the dependence on CodeGen here. + RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32); + } else { + // Physical register reference + auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code); + if (Kind == 'a') { + RegCount = NumRegs; + MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u)); + } + + continue; + } + + if (CI.Type == InlineAsm::isOutput) { + // Apply tuple alignment requirement + // + // TODO: This is more conservative than necessary. + AGPRDefCount = alignTo(AGPRDefCount, RegCount); + + AGPRDefCount += RegCount; + if (CI.isEarlyClobber) { + AGPRUseCount = alignTo(AGPRUseCount, RegCount); + AGPRUseCount += RegCount; + } + } else { + AGPRUseCount = alignTo(AGPRUseCount, RegCount); + AGPRUseCount += RegCount; + } } } - return false; + unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount); + + // TODO: This is overly conservative. If there are any physical registers, + // allocate any virtual registers after them so we don't have to solve optimal + // packing. + return std::min(MaxVirtReg + MaxPhysReg, 256u); } // TODO: Migrate to range merge of amdgpu-agpr-alloc. @@ -1259,7 +1324,7 @@ struct AAAMDGPUNoAGPR : public StateWrapper { const Function *Callee = dyn_cast(CalleeOp); if (!Callee) { if (const InlineAsm *IA = dyn_cast(CalleeOp)) - return !inlineAsmUsesAGPRs(IA); + return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0; return false; } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll index 664dfa21759cf..b8126caa8b80c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll @@ -1,103 +1,166 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 4 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s +; Shrink result attribute list by preventing use of most attributes. +define internal void @use_most() { +; CHECK-LABEL: define internal void @use_most( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [256 x i8], align 1, addrspace(5) +; CHECK-NEXT: [[ALLOCA_CAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.cluster.id.x() +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.cluster.id.y() +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cluster.id.z() +; CHECK-NEXT: [[TMP7:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; CHECK-NEXT: [[TMP8:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.dispatch.id() +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[IMPLICIT_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr [[ALLOCA_CAST]], ptr addrspace(4) [[IMPLICIT_ARG_PTR]], i64 256, i1 false) +; CHECK-NEXT: ret void +; + %alloca = alloca [256 x i8], addrspace(5) + %alloca.cast = addrspacecast ptr addrspace(5) %alloca to ptr + call i32 @llvm.amdgcn.workitem.id.x() + call i32 @llvm.amdgcn.workitem.id.y() + call i32 @llvm.amdgcn.workitem.id.z() + call i32 @llvm.amdgcn.workgroup.id.x() + call i32 @llvm.amdgcn.workgroup.id.y() + call i32 @llvm.amdgcn.workgroup.id.z() + call i32 @llvm.amdgcn.cluster.id.x() + call i32 @llvm.amdgcn.cluster.id.y() + call i32 @llvm.amdgcn.cluster.id.z() + call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() + call ptr addrspace(4) @llvm.amdgcn.queue.ptr() + call i64 @llvm.amdgcn.dispatch.id() + call i32 @llvm.amdgcn.lds.kernel.id() + %implicit.arg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + call void @llvm.memcpy.p0.p4(ptr %alloca.cast, ptr addrspace(4) %implicit.arg.ptr, i64 256, i1 false) + ret void +} + define amdgpu_kernel void @kernel_uses_asm_virtreg() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void asm sideeffect "; use $0", "a"(i32 poison) + call void @use_most() ret void } define amdgpu_kernel void @kernel_uses_asm_virtreg_def() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: [[DEF:%.*]] = call i32 asm sideeffect " +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; %def = call i32 asm sideeffect "; def $0", "=a"() + call void @use_most() ret void } define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: [[DEF:%.*]] = call i64 asm sideeffect " +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; %def = call i64 asm sideeffect "; def $0", "={a[0:1]}"() + call void @use_most() ret void } define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void asm sideeffect "; use $0", "v,a"(i32 poison, i32 poison) + call void @use_most() ret void } define amdgpu_kernel void @kernel_uses_non_agpr_asm() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_non_agpr_asm( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void asm sideeffect "; use $0", "v"(i32 poison) + call void @use_most() ret void } define amdgpu_kernel void @kernel_uses_asm_physreg() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void asm sideeffect "; use $0", "{a0}"(i32 poison) + call void @use_most() ret void } define amdgpu_kernel void @kernel_uses_asm_physreg_tuple() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_tuple( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison) + call void @use_most() ret void } define void @func_uses_asm_virtreg_agpr() { ; CHECK-LABEL: define void @func_uses_asm_virtreg_agpr( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void asm sideeffect "; use $0", "a"(i32 poison) + call void @use_most() ret void } define void @func_uses_asm_physreg_agpr() { ; CHECK-LABEL: define void @func_uses_asm_physreg_agpr( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void asm sideeffect "; use $0", "{a0}"(i32 poison) + call void @use_most() ret void } define void @func_uses_asm_physreg_agpr_tuple() { ; CHECK-LABEL: define void @func_uses_asm_physreg_agpr_tuple( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison) + call void @use_most() ret void } @@ -105,99 +168,119 @@ declare void @unknown() define amdgpu_kernel void @kernel_calls_extern() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern( -; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void @unknown() + call void @use_most() ret void } define amdgpu_kernel void @kernel_calls_extern_marked_callsite() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite( -; CHECK-SAME: ) #[[ATTR2]] { -; CHECK-NEXT: call void @unknown() #[[ATTR6:[0-9]+]] +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @unknown() #[[ATTR5:[0-9]+]] +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void @unknown() #0 + call void @use_most() ret void } define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect( -; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] { +; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: call void [[INDIRECT]]() +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void %indirect() + call void @use_most() ret void } define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite( -; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR6]] +; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR5]] +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void %indirect() #0 + call void @use_most() ret void } define amdgpu_kernel void @kernel_transitively_uses_agpr_asm() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_transitively_uses_agpr_asm( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: call void @func_uses_asm_physreg_agpr() +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void @func_uses_asm_physreg_agpr() + call void @use_most() ret void } define void @empty() { ; CHECK-LABEL: define void @empty( -; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; + call void @use_most() ret void } define void @also_empty() { ; CHECK-LABEL: define void @also_empty( -; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; + call void @use_most() ret void } define amdgpu_kernel void @kernel_calls_empty() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_empty( -; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: call void @empty() +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void @empty() + call void @use_most() ret void } define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: call void @empty() ; CHECK-NEXT: call void @func_uses_asm_physreg_agpr() +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void @empty() call void @func_uses_asm_physreg_agpr() + call void @use_most() ret void } define amdgpu_kernel void @kernel_calls_generic_intrinsic(ptr %ptr0, ptr %ptr1, i64 %size) { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_generic_intrinsic( -; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] { +; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[PTR0]], ptr [[PTR1]], i64 [[SIZE]], i1 false) +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p0.p0.i64(ptr %ptr0, ptr %ptr1, i64 %size, i1 false) + call void @use_most() ret void } @@ -205,31 +288,35 @@ declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float> define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(ptr addrspace(1) %out, float %a, float %b, <32 x float> %c) { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32( -; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]], <32 x float> [[C:%.*]]) #[[ATTR1]] { +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]], <32 x float> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[RESULT:%.*]] = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float [[A]], float [[B]], <32 x float> [[C]], i32 0, i32 0, i32 0) ; CHECK-NEXT: store <32 x float> [[RESULT]], ptr addrspace(1) [[OUT]], align 128 +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; %result = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0) store <32 x float> %result, ptr addrspace(1) %out + call void @use_most() ret void } define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_workitem_id_x( -; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; %result = call i32 @llvm.amdgcn.workitem.id.x() store i32 %result, ptr addrspace(1) %out + call void @use_most() ret void } define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) { ; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr( -; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty ; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]] @@ -244,21 +331,382 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) { ; CHECK: 5: ; CHECK-NEXT: unreachable ; CHECK: 6: +; CHECK-NEXT: call void @use_most() ; CHECK-NEXT: ret void ; %fptr = select i1 %cond, ptr @empty, ptr @also_empty call void %fptr() + call void @use_most() ret void } +define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[DEF:%.*]] = call { i32, i32 } asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + %def = call {i32, i32} asm sideeffect "; def $0", "=a,=a"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[DEF:%.*]] = call { i32, <2 x i32> } asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + %def = call {i32, <2 x i32>} asm sideeffect "; def $0", "=a,=a"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_2() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_2( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[DEF:%.*]] = call { i32, <2 x i32> } asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + %def = call {i32, <2 x i32>} asm sideeffect "; def $0", "=a,=v"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "a"(ptr poison) + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[DEF:%.*]] = call ptr asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + %def = call ptr asm sideeffect "; def $0", "=a"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[DEF:%.*]] = call <2 x ptr> asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + %def = call <2 x ptr> asm sideeffect "; def $0", "=a"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[DEF:%.*]] = call { i32, i32 } asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + %def = call {i32, i32} asm sideeffect "; def $0", "={a0},={a[4:5]}"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_clobber() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; clobber $0", "~{a4}"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_clobber_tuple() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_tuple( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; clobber $0", "~{a[10:13]}"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_clobber_oob() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_oob( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; clobber $0", "~{a256}"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_clobber_max() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_max( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; clobber $0", "~{a255}"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_physreg_oob() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_oob( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "{a256}"(i32 poison) + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[DEF:%.*]] = call <32 x i32> asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + %def = call <32 x i32> asm sideeffect "; def $0", "=a"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "a"(<32 x i32> poison) + call void @use_most() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[DEF:%.*]] = call <32 x i32> asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + %def = call <32 x i32> asm sideeffect "; use $0", "=a,a"(<32 x i32> poison) + call void @use_most() + ret void +} + +define amdgpu_kernel void @vreg_use_exceeds_register_file() { +; CHECK-LABEL: define amdgpu_kernel void @vreg_use_exceeds_register_file( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "a"(<257 x i32> poison) + call void @use_most() + ret void +} + +define amdgpu_kernel void @vreg_def_exceeds_register_file() { +; CHECK-LABEL: define amdgpu_kernel void @vreg_def_exceeds_register_file( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[DEF:%.*]] = call <257 x i32> asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + %def = call <257 x i32> asm sideeffect "; def $0", "=a"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @multiple() { +; CHECK-LABEL: define amdgpu_kernel void @multiple( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[DEF:%.*]] = call { <16 x i32>, <8 x i32>, <8 x i32> } asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + %def = call {<16 x i32>, <8 x i32>, <8 x i32>} asm sideeffect "; def $0", "=a,=a,=a,a,a,a"(<4 x i32> splat (i32 0), <8 x i32> splat (i32 1), i64 999) + call void @use_most() + ret void +} + +define amdgpu_kernel void @earlyclobber_0() { +; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_0( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[DEF:%.*]] = call <8 x i32> asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + %def = call <8 x i32> asm sideeffect "; def $0", "=&a,a"(i32 0) + call void @use_most() + ret void +} + +define amdgpu_kernel void @earlyclobber_1() { +; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_1( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[DEF:%.*]] = call { <8 x i32>, <16 x i32> } asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + %def = call { <8 x i32>, <16 x i32 > } asm sideeffect "; def $0, $1", "=&a,=&a,a,a"(i32 0, <16 x i32> splat (i32 1)) + call void @use_most() + ret void +} + +define amdgpu_kernel void @physreg_a32__vreg_a256__vreg_a512() { +; CHECK-LABEL: define amdgpu_kernel void @physreg_a32__vreg_a256__vreg_a512( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0, $1, $2", "{a16},a,a"(i32 poison, <8 x i32> poison, <16 x i32> poison) + call void @use_most() + ret void +} + +define amdgpu_kernel void @physreg_def_a32__def_vreg_a256__def_vreg_a512() { +; CHECK-LABEL: define amdgpu_kernel void @physreg_def_a32__def_vreg_a256__def_vreg_a512( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = call { i32, <8 x i32>, <16 x i32> } asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call {i32, <8 x i32>, <16 x i32>} asm sideeffect "; def $0, $1, $2", "={a16},=a,=a"() + call void @use_most() + ret void +} + +define amdgpu_kernel void @physreg_def_a32___def_vreg_a512_use_vreg_a256() { +; CHECK-LABEL: define amdgpu_kernel void @physreg_def_a32___def_vreg_a512_use_vreg_a256( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = call { i32, <16 x i32> } asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call {i32, <16 x i32>} asm sideeffect "; def $0, $1, $2", "={a16},=a,a"(<8 x i32> poison) + call void @use_most() + ret void +} + +define amdgpu_kernel void @mixed_physreg_vreg_tuples_0() { +; CHECK-LABEL: define amdgpu_kernel void @mixed_physreg_vreg_tuples_0( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0, $1", "{a[1:4]},a"(<4 x i32> poison, <4 x i32> poison) + call void @use_most() + ret void +} + +define amdgpu_kernel void @mixed_physreg_vreg_tuples_1() { +; CHECK-LABEL: define amdgpu_kernel void @mixed_physreg_vreg_tuples_1( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0, $1", "a,{a[0:3]}"(<4 x i32> poison, <4 x i32> poison) + call void @use_most() + ret void +} + +define amdgpu_kernel void @physreg_raises_limit() { +; CHECK-LABEL: define amdgpu_kernel void @physreg_raises_limit( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0, $1", "a,{a[5:8]}"(<4 x i32> poison, <4 x i32> poison) + call void @use_most() + ret void +} + +; FIXME: This should require 9. We cannot allocate an a128 at a0. +define amdgpu_kernel void @physreg_tuple_alignment_raises_limit() { +; CHECK-LABEL: define amdgpu_kernel void @physreg_tuple_alignment_raises_limit( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0, $1", "a,{a[1:4]}"(<4 x i32> poison, <4 x i32> poison) + call void @use_most() + ret void +} + +define amdgpu_kernel void @align3_virtreg() { +; CHECK-LABEL: define amdgpu_kernel void @align3_virtreg( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0, $1", "a,a"(<3 x i32> poison, <3 x i32> poison) + call void @use_most() + ret void +} + +define amdgpu_kernel void @align3_align4_virtreg() { +; CHECK-LABEL: define amdgpu_kernel void @align3_align4_virtreg( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0, $1", "a,a"(<3 x i32> poison, <4 x i32> poison) + call void @use_most() + ret void +} + +define amdgpu_kernel void @align2_align4_virtreg() { +; CHECK-LABEL: define amdgpu_kernel void @align2_align4_virtreg( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: call void @use_most() +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0, $1", "a,a"(<2 x i32> poison, <4 x i32> poison) + call void @use_most() + ret void +} attributes #0 = { "amdgpu-agpr-alloc"="0" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" } ;.