diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index bda3b066b7763..13ec0954b545a 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2206,11 +2206,18 @@ class AMDGPUWaveReduce : Intrinsic< def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce; def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce; +// i32 llvm.amdgcn.readfirstlane(i32) def int_amdgcn_readfirstlane : ClangBuiltin<"__builtin_amdgcn_readfirstlane">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +// ty llvm.amdgcn.readfirstlane2(ty) +// A type-generic version of readfirstlane. +def int_amdgcn_readfirstlane2 : + Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + // The lane argument must be uniform across the currently active threads of the // current wave. Otherwise, the result is undefined. 
def int_amdgcn_readlane : diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index f283af6fa07d3..04e8f77d2fa32 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5453,6 +5453,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(LDS) NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD) NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD) + NODE_NAME_CASE(READFIRSTLANE) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(LOAD_D16_HI) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index f10a357125e56..4c22f8586ec23 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -541,6 +541,8 @@ enum NodeType : unsigned { FPTRUNC_ROUND_UPWARD, FPTRUNC_ROUND_DOWNWARD, + READFIRSTLANE, + DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, LOAD_D16_HI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 82f58ea38fd0a..560474613adf3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -342,6 +342,8 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2", def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; +def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", SDTIntUnaryOp>; + // SI+ export def AMDGPUExportOp : SDTypeProfile<0, 8, [ SDTCisInt<0>, // i8 tgt @@ -504,3 +506,7 @@ def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2), [(int_amdgcn_perm node:$src0, node:$src1, node:$src2), (AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>; + +def AMDGPUreadfirstlane : PatFrags<(ops node:$src), + [(int_amdgcn_readfirstlane node:$src), + 
(AMDGPUreadfirstlane_impl node:$src)]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 410dc83d45c57..60a2127e4991d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -393,6 +393,7 @@ def UniformIntrinsics : GenericTable { } def : AlwaysUniform<int_amdgcn_readfirstlane>; +def : AlwaysUniform<int_amdgcn_readfirstlane2>; def : AlwaysUniform<int_amdgcn_readlane>; def : AlwaysUniform<int_amdgcn_icmp>; def : AlwaysUniform<int_amdgcn_fcmp>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0a4370de0613b..a2985582a89de 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -238,7 +238,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Promote); AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16); } - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); @@ -8452,6 +8451,26 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_addrspacecast_nonnull: return lowerADDRSPACECAST(Op, DAG); + case Intrinsic::amdgcn_readfirstlane2: + if (VT.getSizeInBits() <= 32) { + MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits()); + return DAG.getBitcast( + VT, DAG.getAnyExtOrTrunc( + DAG.getNode(AMDGPUISD::READFIRSTLANE, DL, MVT::i32, + DAG.getAnyExtOrTrunc( + DAG.getBitcast(IntVT, Op.getOperand(1)), DL, + MVT::i32)), + DL, IntVT)); + } + if (VT.getSizeInBits() % 32 == 0) { + MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); + return DAG.getBitcast( + VT, DAG.UnrollVectorOp( + DAG.getNode(AMDGPUISD::READFIRSTLANE, DL, VecVT, + DAG.getBitcast(VecVT, Op.getOperand(1))) + .getNode())); + } + return SDValue(); default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) diff --git 
a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 1c942dcefdace..8ec74354fafa6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3405,7 +3405,7 @@ def : GCNPat< // FIXME: Should also do this for readlane, but tablegen crashes on // the ignored src1. def : GCNPat< - (int_amdgcn_readfirstlane (i32 imm:$src)), + (AMDGPUreadfirstlane (i32 imm:$src)), (S_MOV_B32 SReg_32:$src) >; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 2341e0d9d32bb..809cdde14bbe3 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -243,7 +243,7 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> { // FIXME: Specify SchedRW for READFIRSTLANE_B32 // TODO: There is VOP3 encoding also def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE, - getVOP1Pat<int_amdgcn_readfirstlane, VOP_READFIRSTLANE>.ret, 1> { + getVOP1Pat<AMDGPUreadfirstlane, VOP_READFIRSTLANE>.ret, 1> { let isConvergent = 1; } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 0284f44f5f14d..60fbbb63878d0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -1,65 +1,96 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s -declare i32 @llvm.amdgcn.readfirstlane(i32) #0 - -; CHECK-LABEL: {{^}}test_readfirstlane: -; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 -define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) #1 { +define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) { +; CHECK-LABEL: test_readfirstlane: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; 
CHECK-NEXT: flat_store_dword v[0:1], v2 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_readfirstlane_imm: -; CHECK: s_mov_b32 [[SGPR_VAL:s[0-9]]], 32 -; CHECK-NOT: [[SGPR_VAL]] -; CHECK: ; use [[SGPR_VAL]] -define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) { +; CHECK-LABEL: test_readfirstlane_imm: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s0, 32 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_endpgm %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) call void asm sideeffect "; use $0", "s"(i32 %readfirstlane) ret void } -; CHECK-LABEL: {{^}}test_readfirstlane_imm_fold: -; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32 -; CHECK-NOT: [[VVAL]] -; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] -define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) { +; CHECK-LABEL: test_readfirstlane_imm_fold: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v2, 32 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: flat_store_dword v[0:1], v2 +; CHECK-NEXT: s_endpgm %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_readfirstlane_m0: -; CHECK: s_mov_b32 m0, -1 -; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0 -; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] -define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { +; 
CHECK-LABEL: test_readfirstlane_m0: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v2, m0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: flat_store_dword v[0:1], v2 +; CHECK-NEXT: s_endpgm %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"() %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr: -; CHECK: ;;#ASMSTART -; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]] -; CHECK: ;;#ASMEND -; CHECK-NOT: [[SGPR]] -; CHECK-NOT: readfirstlane -; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]] -; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]] -define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) { +; CHECK-LABEL: test_readfirstlane_copy_from_sgpr: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: s_mov_b32 s2, 0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: flat_store_dword v[0:1], v2 +; CHECK-NEXT: s_endpgm %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"() %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } -; Make sure this doesn't crash. 
-; CHECK-LABEL: {{^}}test_readfirstlane_fi: -; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 0 -define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) { +; CHECK-LABEL: test_readfirstlane_fi: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s4 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_endpgm %alloca = alloca i32, addrspace(5) %int = ptrtoint ptr addrspace(5) %alloca to i32 %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int) @@ -67,5 +98,156 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { ret void } -attributes #0 = { nounwind readnone convergent } -attributes #1 = { nounwind } +define void @test_readfirstlane2_i32(ptr addrspace(1) %out, i32 %src) { +; CHECK-LABEL: test_readfirstlane2_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s4 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_setpc_b64 s[30:31] + %x = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %src) + call void asm sideeffect "; use $0", "s"(i32 %x) + ret void +} + +define void @test_readfirstlane2_i64(ptr addrspace(1) %out, i64 %src) { +; CHECK-LABEL: test_readfirstlane2_i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[4:5] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_setpc_b64 s[30:31] + %x = call i64 @llvm.amdgcn.readfirstlane2.i64(i64 %src) + call void asm sideeffect "; use $0", "s"(i64 %x) + ret void +} + +define void @test_readfirstlane2_v7i32(ptr addrspace(1) %out, <7 x i32> %src) { +; CHECK-LABEL: test_readfirstlane2_v7i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[4:10] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_setpc_b64 s[30:31] + %x = call <7 x i32> @llvm.amdgcn.readfirstlane2.v7i32(<7 x i32> %src) + call void asm sideeffect "; use $0", "s"(<7 x i32> %x) + ret void +} + +define void @test_readfirstlane2_f16(ptr addrspace(1) %out, half %src) { +; CHECK-LABEL: test_readfirstlane2_f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s4 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_setpc_b64 s[30:31] + %x = call half @llvm.amdgcn.readfirstlane2.f16(half %src) + call void asm sideeffect "; use $0", "s"(half %x) + ret void +} + +define void @test_readfirstlane2_v2f16(ptr addrspace(1) %out, <2 x half> %src) { +; CHECK-LABEL: test_readfirstlane2_v2f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s4 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_setpc_b64 s[30:31] + %x = call <2 x half> @llvm.amdgcn.readfirstlane2.v2f16(<2 x half> %src) + call void asm sideeffect "; use $0", "s"(<2 x half> %x) + ret void +} + +define void @test_readfirstlane2_bf16(ptr addrspace(1) %out, bfloat %src) { +; CHECK-LABEL: test_readfirstlane2_bf16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s4 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_setpc_b64 s[30:31] + %x = call bfloat @llvm.amdgcn.readfirstlane2.bf16(bfloat %src) + call void asm sideeffect "; use $0", 
"s"(bfloat %x) + ret void +} + +define void @test_readfirstlane2_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %src) { +; CHECK-LABEL: test_readfirstlane2_v4bf16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[4:5] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_setpc_b64 s[30:31] + %x = call <4 x bfloat> @llvm.amdgcn.readfirstlane2.v4bf16(<4 x bfloat> %src) + call void asm sideeffect "; use $0", "s"(<4 x bfloat> %x) + ret void +} + +define void @test_readfirstlane2_float(ptr addrspace(1) %out, float %src) { +; CHECK-LABEL: test_readfirstlane2_float: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s4 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_setpc_b64 s[30:31] + %x = call float @llvm.amdgcn.readfirstlane2.f32(float %src) + call void asm sideeffect "; use $0", "s"(float %x) + ret void +} + +define void @test_readfirstlane2_p0(ptr addrspace(1) %out, ptr %src) { +; CHECK-LABEL: test_readfirstlane2_p0: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[4:5] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_setpc_b64 s[30:31] + %x = call ptr @llvm.amdgcn.readfirstlane2.p0(ptr %src) + call void asm sideeffect "; use $0", "s"(ptr %x) + ret void +} + +define void @test_readfirstlane2_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) { +; CHECK-LABEL: test_readfirstlane2_v3p0: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-NEXT: 
v_readfirstlane_b32 s4, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[4:9] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x ptr> @llvm.amdgcn.readfirstlane2.v3p0(<3 x ptr> %src) + call void asm sideeffect "; use $0", "s"(<3 x ptr> %x) + ret void +}