diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 275d0193452a5..5fcbf810abcbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5580,6 +5580,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, return false; LLT PartialResTy = LLT::scalar(SplitSize); + bool NeedsBitcast = false; if (Ty.isVector()) { LLT EltTy = Ty.getElementType(); unsigned EltSize = EltTy.getSizeInBits(); @@ -5588,8 +5589,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, } else if (EltSize == 16 || EltSize == 32) { unsigned NElem = SplitSize / EltSize; PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem)); + } else { + // Handle all other cases via S32/S64 pieces + NeedsBitcast = true; } - // Handle all other cases via S32/S64 pieces; } SmallVector PartialRes; @@ -5615,7 +5618,12 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy)); } - B.buildMergeLikeInstr(DstReg, PartialRes); + if (NeedsBitcast) + B.buildBitcast(DstReg, B.buildMergeLikeInstr( + LLT::scalar(Ty.getSizeInBits()), PartialRes)); + else + B.buildMergeLikeInstr(DstReg, PartialRes); + MI.eraseFromParent(); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index db557ff23c085..693e0ebd0280c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -9398,3 +9398,1015 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr store <8 x i16> %v, ptr addrspace(1) %out ret void } + +define void @v_permlane16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v2i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v2i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v2i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlane16_v2i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: 
v_permlane16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v2i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v2i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x i64> @llvm.amdgcn.permlane16.v2i64(<2 x i64> %src0, <2 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <2 x i64> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v3i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: 
v_readfirstlane_b32 s4, v8 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v9 +; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v3i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v8 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v9 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v3i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-SDAG-NEXT: 
global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlane16_v3i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v3i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v3i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlane16.v3i64(<3 x i64> %src0, <3 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <3 x i64> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v4f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v10 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v11 +; GFX10-SDAG-NEXT: v_permlane16_b32 v9, v9, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v4f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 
s4, v10 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v11 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v9, v9, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v4f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlane16_v4f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; 
GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v4f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v4f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 
+; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlane16.v4f64(<4 x double> %src0, <4 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <4 x double> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v8f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v18 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v19 +; GFX10-SDAG-NEXT: v_permlane16_b32 v17, v17, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v16, v16, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v15, v15, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v14, v14, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v13, v13, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v12, v12, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v11, v11, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v10, v10, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v9, v9, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 +; GFX10-SDAG-NEXT: 
global_store_dwordx4 v[0:1], v[10:13], off offset:32 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v8f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v18 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v19 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v9, v9, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v10, v10, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v11, v11, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v12, v12, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v13, v13, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v14, v14, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v15, v15, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v16, v16, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v17, v17, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v8f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v17, v17, s0, s1 +; GFX11-SDAG-NEXT: 
v_permlane16_b32 v16, v16, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v15, v15, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v14, v14, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v13, v13, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v12, v12, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v11, v11, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v10, v10, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x3 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlane16_v8f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v10, v10, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v11, v11, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v12, v12, s0, s1 +; 
GFX11-GISEL-NEXT: v_permlane16_b32 v13, v13, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v14, v14, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v15, v15, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v16, v16, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v17, v17, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x3 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v8f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlane16_b32 v17, v17, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v16, v16, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v15, v15, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v14, v14, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v13, v13, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v12, v12, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v11, v11, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v10, v10, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x3 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX12-SDAG-NEXT: 
global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v8f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v10, v10, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v11, v11, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v12, v12, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v13, v13, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v14, v14, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v15, v15, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v16, v16, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v17, v17, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x3 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlane16.v8f64(<8 x double> %src0, <8 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <8 x double> %v, 
ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v2i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v2i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v2i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_v2i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; 
GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v2i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v2i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x i64> @llvm.amdgcn.permlanex16.v2i64(<2 x i64> %src0, <2 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <2 x i64> 
%v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v3i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v9 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v3i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v8 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v9 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v3i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 
v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_v3i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v3i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v3i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlanex16.v3i64(<3 x i64> %src0, <3 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <3 x i64> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v4f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v10 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v11 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v9, v9, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v8, v8, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: 
global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v4f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v10 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v11 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v8, v8, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v9, v9, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v4f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_v4f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v4f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v4f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlanex16.v4f64(<4 x double> %src0, <4 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <4 x double> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v8f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v18 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v19 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v17, v17, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v16, v16, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v15, v15, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v14, v14, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v13, v13, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v12, v12, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v11, v11, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v10, v10, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v9, v9, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v8, v8, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 
v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v8f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v18 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v19 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v8, v8, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v9, v9, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v10, v10, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v11, v11, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v12, v12, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v13, v13, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v14, v14, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v15, v15, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v16, v16, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v17, v17, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 +; 
GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v8f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v17, v17, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v16, v16, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v15, v15, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v14, v14, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v13, v13, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v12, v12, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v11, v11, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v10, v10, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x3 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_v8f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: 
v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v10, v10, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v11, v11, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v12, v12, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v13, v13, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v14, v14, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v15, v15, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v16, v16, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v17, v17, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x3 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v8f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlanex16_b32 v17, v17, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v16, v16, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v15, v15, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v14, v14, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v13, v13, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v12, v12, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v11, v11, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v10, v10, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; 
GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x3 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v8f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v10, v10, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v11, v11, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v12, v12, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v13, v13, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v14, v14, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v15, v15, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v16, v16, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v17, v17, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x3 +; 
GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlanex16.v8f64(<8 x double> %src0, <8 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <8 x double> %v, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll index f23f9595446eb..6698d360aff4c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG %s +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-GISEL %s declare i32 @llvm.amdgcn.permlane64(i32) declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) { +define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) { ; GFX11-LABEL: test_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 @@ -17,12 +17,93 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) { ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane64(i32 %src0) +; GFX11-SDAG-LABEL: test_s_i32: +; GFX11-SDAG: ; %bb.0: +; 
GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_s_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %src0) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { +define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX11-SDAG-LABEL: test_s_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_s_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %src0) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) { +; GFX11-SDAG-LABEL: test_s_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_s_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane64.f64(double %src0) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) { ; GFX11-LABEL: test_i: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 @@ -32,12 +113,115 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane64(i32 99) +; GFX11-SDAG-LABEL: test_i_i32: +; GFX11-SDAG: ; 
%bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 99) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { +define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: test_i_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlane64.f32(float 1234.5) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) { +; 
GFX11-SDAG-LABEL: test_i_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 99) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: test_i_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x40934a00 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x40934a00 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 
v0, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane64.f64(double 1234.5) + store double %v, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_kernel void @test_v_i32(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX11-LABEL: test_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 @@ -47,11 +231,430 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: test_v_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane64(i32 %tidx) + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %tidx) store i32 %v, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX11-GISEL: {{.*}} -; GFX11-SDAG: {{.*}} + +define amdgpu_kernel void @test_v_f32(ptr addrspace(1) %out, float %src0) #1 { +; GFX11-SDAG-LABEL: test_v_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlane64.f32(float %tidx_f32) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_i64(ptr addrspace(1) %out, i64 %src0) #1 { +; GFX11-SDAG-LABEL: test_v_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %tidx_i64) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 { +; GFX11-SDAG-LABEL: test_v_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlane64.f64(double %tidx_f64) + store double %v, 
ptr addrspace(1) %out + ret void +} + +define void @test_half(ptr addrspace(1) %out, half %src0) { +; GFX11-SDAG-LABEL: test_half: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_half: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call half @llvm.amdgcn.permlane64.f16(half %src0) + store half %v, ptr addrspace(1) %out + ret void +} + +define void @test_bfloat(ptr addrspace(1) %out, bfloat %src0) { +; GFX11-SDAG-LABEL: test_bfloat: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_bfloat: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call bfloat @llvm.amdgcn.permlane64.bf16(bfloat %src0) + store bfloat %v, ptr addrspace(1) %out + ret void +} + +define void @test_i16(ptr addrspace(1) %out, i16 %src0) { +; GFX11-SDAG-LABEL: test_i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_setpc_b64 
s[30:31] + %v = call i16 @llvm.amdgcn.permlane64.i16(i16 %src0) + store i16 %v, ptr addrspace(1) %out + ret void +} + +define void @test_v2f16(ptr addrspace(1) %out, <2 x half> %src0) { +; GFX11-SDAG-LABEL: test_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane64.v2f16(<2 x half> %src0) + store <2 x half> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v2f32(ptr addrspace(1) %out, <2 x float> %src0) { +; GFX11-SDAG-LABEL: test_v2f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v2f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane64.v2f32(<2 x float> %src0) + store <2 x float> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v7i32(ptr addrspace(1) %out, <7 x i32> %src0) { +; GFX11-SDAG-LABEL: test_v7i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 +; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 +; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 +; GFX11-SDAG-NEXT: 
v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v7i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 +; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 +; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane64.v7i32(<7 x i32> %src0) + store <7 x i32> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v8i16(ptr addrspace(1) %out, <8 x i16> %src0) { +; GFX11-SDAG-LABEL: test_v8i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v8i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call 
<8 x i16> @llvm.amdgcn.permlane64.v8i16(<8 x i16> %src0) + store <8 x i16> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v2i64(ptr addrspace(1) %out, <2 x i64> %src0) { +; GFX11-SDAG-LABEL: test_v2i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v2i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x i64> @llvm.amdgcn.permlane64.v2i64(<2 x i64> %src0) + store <2 x i64> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v3i64(ptr addrspace(1) %out, <3 x i64> %src0) { +; GFX11-SDAG-LABEL: test_v3i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 +; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v3i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: 
v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 +; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlane64.v3i64(<3 x i64> %src0) + store <3 x i64> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v4f64(ptr addrspace(1) %out, <4 x double> %src0) { +; GFX11-SDAG-LABEL: test_v4f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v9, v9 +; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 +; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 +; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v4f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 +; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 +; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 +; GFX11-GISEL-NEXT: v_permlane64_b32 v9, v9 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlane64.v4f64(<4 x double> %src0) + store 
<4 x double> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v8f64(ptr addrspace(1) %out, <8 x double> %src0) { +; GFX11-SDAG-LABEL: test_v8f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v17, v17 +; GFX11-SDAG-NEXT: v_permlane64_b32 v16, v16 +; GFX11-SDAG-NEXT: v_permlane64_b32 v15, v15 +; GFX11-SDAG-NEXT: v_permlane64_b32 v14, v14 +; GFX11-SDAG-NEXT: v_permlane64_b32 v13, v13 +; GFX11-SDAG-NEXT: v_permlane64_b32 v12, v12 +; GFX11-SDAG-NEXT: v_permlane64_b32 v11, v11 +; GFX11-SDAG-NEXT: v_permlane64_b32 v10, v10 +; GFX11-SDAG-NEXT: v_permlane64_b32 v9, v9 +; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 +; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 +; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: s_clause 0x3 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v8f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 +; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 +; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 +; GFX11-GISEL-NEXT: v_permlane64_b32 v9, v9 +; GFX11-GISEL-NEXT: v_permlane64_b32 v10, v10 +; GFX11-GISEL-NEXT: v_permlane64_b32 v11, v11 +; GFX11-GISEL-NEXT: v_permlane64_b32 v12, v12 +; GFX11-GISEL-NEXT: v_permlane64_b32 v13, v13 +; GFX11-GISEL-NEXT: 
v_permlane64_b32 v14, v14 +; GFX11-GISEL-NEXT: v_permlane64_b32 v15, v15 +; GFX11-GISEL-NEXT: v_permlane64_b32 v16, v16 +; GFX11-GISEL-NEXT: v_permlane64_b32 v17, v17 +; GFX11-GISEL-NEXT: s_clause 0x3 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlane64.v8f64(<8 x double> %src0) + store <8 x double> %v, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 55fa02a0c582c..a369b33562d6c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -163,30 +163,157 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) { ret void } -; FIXME: Broken -; define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) { -; %x = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src) -; call void asm sideeffect "; use $0", "s"(<2 x i64> %x) -; ret void -; } +define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v2i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v2i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 
s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src) + call void asm sideeffect "; use $0", "s"(<2 x i64> %x) + ret void +} -; define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) { -; %x = call <3 x i64> @llvm.amdgcn.readfirstlane.v3i64(<3 x i64> %src) -; call void asm sideeffect "; use $0", "s"(<3 x i64> %x) -; ret void -; } +define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v3i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v3i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:9] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x i64> @llvm.amdgcn.readfirstlane.v3i64(<3 x i64> %src) + call void asm sideeffect "; use $0", "s"(<3 x i64> %x) + ret void +} 
-; define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) { -; %x = call <4 x i64> @llvm.amdgcn.readfirstlane.v4i64(<4 x i64> %src) -; call void asm sideeffect "; use $0", "s"(<4 x i64> %x) -; ret void -; } +define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v4i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:11] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v4i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:11] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <4 x i64> @llvm.amdgcn.readfirstlane.v4i64(<4 x i64> %src) + call void asm sideeffect "; use $0", "s"(<4 x i64> %x) + ret void +} -; define void @test_readfirstlane_v8i64(ptr addrspace(1) %out, <8 x i64> %src) { -; %x = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> %src) -; call void asm sideeffect "; use $0", "s"(<8 x i64> %x) -; ret void -; } +define 
void @test_readfirstlane_v8i64(ptr addrspace(1) %out, <8 x i64> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v8i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v8i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16 +; 
CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> %src) + call void asm sideeffect "; use $0", "s"(<8 x i64> %x) + ret void +} define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_f64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index edb6ebcee1325..8306ef2e78b3c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -894,6 +894,174 @@ define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src ret void } +define void @test_readlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v2i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v2i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v6 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s7 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s7 +; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s7 +; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s7 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <2 x i64> 
@llvm.amdgcn.readlane.v2i64(<2 x i64> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<2 x i64> %x) + ret void +} + +define void @test_readlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v3i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v3i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v8 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s9 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:9] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x i64> @llvm.amdgcn.readlane.v3i64(<3 x i64> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<3 x i64> %x) + ret void +} + +define void @test_readlane_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v4f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v10 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s11, v9, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4 +; 
CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:11] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v4f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v10 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s11, v9, s11 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:11] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <4 x double> @llvm.amdgcn.readlane.v4f64(<4 x double> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<4 x double> %x) + ret void +} + +define void @test_readlane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v8f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v18 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s19, v17, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s18, v16, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s17, v15, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s16, v14, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s15, v13, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s14, v12, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s13, v11, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s12, v10, s4 +; 
CHECK-SDAG-NEXT: v_readlane_b32 s11, v9, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v8f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v18 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s11, v9, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s12, v10, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s13, v11, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s14, v12, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s15, v13, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s16, v14, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s17, v15, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s18, v16, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s19, v17, s19 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <8 x double> @llvm.amdgcn.readlane.v8f64(<8 x double> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<8 x double> %x) + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 04d179478590b..59eb23ef65792 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -2687,6 +2687,804 @@ define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %sr ret void } +define void @test_writelane_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v2i64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[7:10], v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v10, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v9, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s8, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v2i64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[7:10], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v6 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s4, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v9, s6, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s7, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s8, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; 
GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v2i64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b128 v[7:10], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v9, s2, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s3, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s4, s1 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[7:10], off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_v2i64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[7:10], v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v7, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v8, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v9, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v10, s8, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_v2i64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[7:10], v[0:1], off +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; 
GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v7, s4, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v8, s6, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v9, s7, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s8, s5 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_v2i64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_b128 v[7:10], v[0:1], off +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v6 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v7, s0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v8, s2, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v9, s3, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s4, s1 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[7:10], off +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <2 x i64>, ptr addrspace(1) %out + %writelane = call <2 x i64> @llvm.amdgcn.writelane.v2i64(<2 x i64> %src, i32 %src1, <2 x i64> %oldval) + store <2 x i64> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v3i64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: v_add_u32_e32 v13, vcc, 16, v0 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[9:12], v[0:1] +; GFX802-SDAG-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: flat_load_dwordx2 v[15:16], v[13:14] +; GFX802-SDAG-NEXT: 
v_readfirstlane_b32 s4, v8 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v7 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v6 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX802-SDAG-NEXT: v_writelane_b32 v12, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v11, s8, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v10, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v9, s10, m0 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v16, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v15, s6, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[9:12] +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[13:14], v[15:16] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v3i64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: global_load_dwordx2 v[13:14], v[0:1], off offset:16 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[9:12], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v6 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s9, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v9, s10, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5 +; GFX1010-SDAG-NEXT: 
global_store_dwordx4 v[0:1], v[9:12], off +; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[13:14], off offset:16 +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v3i64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_load_b64 v[13:14], v[0:1], off offset:16 +; GFX1100-SDAG-NEXT: global_load_b128 v[9:12], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v8 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v6 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s5, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v9, s6, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s2, s1 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[9:12], off +; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[13:14], off offset:16 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_v3i64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: v_add_u32_e32 v17, vcc, 16, v0 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[9:12], v[0:1] +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[13:16], v[17:18] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v8 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, 
v5 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX802-GISEL-NEXT: v_writelane_b32 v9, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v10, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v11, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v12, s8, m0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v13, s9, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v14, s10, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, v13 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, v14 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[9:12] +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[17:18], v[2:3] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_v3i64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[9:12], v[0:1], off +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[13:16], v[0:1], off offset:16 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v6 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v7 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1010-GISEL-NEXT: v_writelane_b32 v9, s4, s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s7, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s8, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s6, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s9, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s10, s5 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, v13 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v3, v14 +; GFX1010-GISEL-NEXT: global_store_dwordx4 
v[0:1], v[9:12], off +; GFX1010-GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16 +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_v3i64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: global_load_b128 v[9:12], v[0:1], off +; GFX1100-GISEL-NEXT: global_load_b128 v[13:16], v[0:1], off offset:16 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v8 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v6 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v5 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v9, s0, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s3, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s4, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s2, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s5, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s6, s1 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, v13 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v3, v14 +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[9:12], off +; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off offset:16 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <3 x i64>, ptr addrspace(1) %out + %writelane = call <3 x i64> @llvm.amdgcn.writelane.v3i64(<3 x i64> %src, i32 %src1, <3 x i64> %oldval) + store <3 x i64> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v4f64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: v_add_u32_e32 v19, vcc, 16, v0 +; 
GFX802-SDAG-NEXT: flat_load_dwordx4 v[11:14], v[0:1] +; GFX802-SDAG-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[15:18], v[19:20] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v10 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s12, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v9 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v8 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX802-SDAG-NEXT: v_writelane_b32 v14, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v13, s10, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v12, s11, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v11, s12, m0 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v18, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v17, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v16, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v15, s8, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v4f64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:16 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[15:18], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s12, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v8 +; GFX1010-SDAG-NEXT: 
v_readfirstlane_b32 s7, v7 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v18, s9, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v17, s10, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v16, s11, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v15, s12, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[11:14], off offset:16 +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v4f64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_load_b128 v[11:14], v[0:1], off offset:16 +; GFX1100-SDAG-NEXT: global_load_b128 v[15:18], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v18, s5, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v17, s6, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s7, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v15, s8, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s2, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1 +; GFX1100-SDAG-NEXT: 
s_clause 0x1 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[15:18], off +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[11:14], off offset:16 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_v4f64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: v_add_u32_e32 v19, vcc, 16, v0 +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[11:14], v[0:1] +; GFX802-GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[15:18], v[19:20] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v10 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v8 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s12, v9 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX802-GISEL-NEXT: v_writelane_b32 v11, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v12, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v13, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v14, s8, m0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v15, s9, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v16, s10, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v17, s11, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v18, s12, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_v4f64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[11:14], v[0:1], off +; GFX1010-GISEL-NEXT: 
global_load_dwordx4 v[15:18], v[0:1], off offset:16 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s12, v9 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s4, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s6, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s7, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s8, s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v15, s9, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v16, s10, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v17, s11, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v18, s12, s5 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off offset:16 +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_v4f64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: global_load_b128 v[11:14], v[0:1], off +; GFX1100-GISEL-NEXT: global_load_b128 v[15:18], v[0:1], off offset:16 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s8, v9 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1) +; 
GFX1100-GISEL-NEXT: v_writelane_b32 v11, s0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s2, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s3, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s4, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v15, s5, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v16, s6, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v17, s7, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v18, s8, s1 +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[11:14], off +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[15:18], off offset:16 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <4 x double>, ptr addrspace(1) %out + %writelane = call <4 x double> @llvm.amdgcn.writelane.v4f64(<4 x double> %src, i32 %src1, <4 x double> %oldval) + store <4 x double> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v8f64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v18 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[18:21], v[0:1] +; GFX802-SDAG-NEXT: v_add_u32_e32 v22, vcc, 16, v0 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[2:5], v[22:23] +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v15 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v14 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s12, v13 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s13, v12 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s14, v11 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s15, v10 +; GFX802-SDAG-NEXT: 
v_readfirstlane_b32 s9, v16 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX802-SDAG-NEXT: v_writelane_b32 v21, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v20, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v19, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v18, s8, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[18:21] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8 +; GFX802-SDAG-NEXT: v_add_u32_e32 v18, vcc, 32, v0 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 48, v0 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[18:19] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v17 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(3) +; GFX802-SDAG-NEXT: v_writelane_b32 v5, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v3, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v2, s7, m0 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX802-SDAG-NEXT: v_writelane_b32 v9, s8, m0 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v15, s12, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v14, s13, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v13, s14, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v12, s15, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s10, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s11, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[22:23], v[2:5] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v8f64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_clause 0x3 +; GFX1010-SDAG-NEXT: 
global_load_dwordx4 v[19:22], v[0:1], off offset:16 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[23:26], v[0:1], off +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:48 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:32 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v18 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s17, v13 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s18, v12 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s19, v11 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s20, v10 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s13, v17 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s14, v16 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s15, v15 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s16, v14 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s12, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v8 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(3) +; GFX1010-SDAG-NEXT: v_writelane_b32 v22, s4, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(2) +; GFX1010-SDAG-NEXT: v_writelane_b32 v26, s9, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1010-SDAG-NEXT: v_writelane_b32 v30, s13, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v34, s17, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v33, s18, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v32, s19, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v31, s20, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v29, s14, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v28, s15, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v27, s16, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v25, s10, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v24, s11, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v23, s12, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v21, s6, s5 +; 
GFX1010-SDAG-NEXT: v_writelane_b32 v20, s7, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v19, s8, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[31:34], off offset:32 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[27:30], off offset:48 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[19:22], off offset:16 +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v8f64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_clause 0x3 +; GFX1100-SDAG-NEXT: global_load_b128 v[19:22], v[0:1], off offset:16 +; GFX1100-SDAG-NEXT: global_load_b128 v[23:26], v[0:1], off +; GFX1100-SDAG-NEXT: global_load_b128 v[27:30], v[0:1], off offset:48 +; GFX1100-SDAG-NEXT: global_load_b128 v[31:34], v[0:1], off offset:32 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v18 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s13, v13 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s14, v12 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s15, v11 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s16, v10 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s9, v17 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s10, v16 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s11, v15 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s12, v14 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(3) +; GFX1100-SDAG-NEXT: v_writelane_b32 v22, s0, s1 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(2) +; GFX1100-SDAG-NEXT: v_writelane_b32 v26, s5, s1 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v30, s9, s1 +; 
GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v34, s13, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v33, s14, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v32, s15, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v31, s16, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v29, s10, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v28, s11, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v27, s12, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v25, s6, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v24, s7, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v23, s8, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v21, s2, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v20, s3, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v19, s4, s1 +; GFX1100-SDAG-NEXT: s_clause 0x3 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[31:34], off offset:32 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[27:30], off offset:48 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[23:26], off +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[19:22], off offset:16 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_v8f64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v18 +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[18:21], v[0:1] +; GFX802-GISEL-NEXT: v_add_u32_e32 v22, vcc, 16, v0 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[2:5], v[22:23] +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v7 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v11 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v12 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v13 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s12, v14 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s13, v15 
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s14, v16 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s15, v17 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX802-GISEL-NEXT: v_writelane_b32 v18, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v19, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v20, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v21, s8, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[18:21] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v6 +; GFX802-GISEL-NEXT: v_add_u32_e32 v18, vcc, 32, v0 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v0 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v8 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v9 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v10 +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[6:9], v[18:19] +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[10:13], v[0:1] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v3, s5, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v4, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v5, s7, m0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX802-GISEL-NEXT: v_writelane_b32 v6, s8, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v7, s9, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v8, s10, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v9, s11, m0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v10, s12, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v11, s13, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v12, s14, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v13, s15, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[22:23], v[2:5] +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[18:19], v[6:9] +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[10:13] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_v8f64: +; GFX1010-GISEL: ; %bb.0: +; 
GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_clause 0x3 +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[19:22], v[0:1], off +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:16 +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:32 +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:48 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v18 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s12, v9 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s13, v10 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s14, v11 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s15, v12 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s16, v13 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s17, v14 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s18, v15 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s19, v16 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s20, v17 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX1010-GISEL-NEXT: v_writelane_b32 v19, s4, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v20, s6, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v21, s7, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v22, s8, s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(2) +; GFX1010-GISEL-NEXT: v_writelane_b32 v23, s9, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v24, s10, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v25, s11, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v26, s12, s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1010-GISEL-NEXT: v_writelane_b32 v27, s13, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v28, s14, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v29, s15, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v30, s16, 
s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v31, s17, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v32, s18, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v33, s19, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v34, s20, s5 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off offset:16 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off offset:32 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off offset:48 +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_v8f64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_clause 0x3 +; GFX1100-GISEL-NEXT: global_load_b128 v[19:22], v[0:1], off +; GFX1100-GISEL-NEXT: global_load_b128 v[23:26], v[0:1], off offset:16 +; GFX1100-GISEL-NEXT: global_load_b128 v[27:30], v[0:1], off offset:32 +; GFX1100-GISEL-NEXT: global_load_b128 v[31:34], v[0:1], off offset:48 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v18 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s8, v9 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s9, v10 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s10, v11 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s11, v12 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s12, v13 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s13, v14 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s14, v15 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s15, v16 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s16, v17 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX1100-GISEL-NEXT: v_writelane_b32 v19, s0, 
s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v20, s2, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v21, s3, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v22, s4, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(2) +; GFX1100-GISEL-NEXT: v_writelane_b32 v23, s5, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v24, s6, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v25, s7, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v26, s8, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v27, s9, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v28, s10, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v29, s11, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v30, s12, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v31, s13, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v32, s14, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v33, s15, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v34, s16, s1 +; GFX1100-GISEL-NEXT: s_clause 0x3 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[19:22], off +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[23:26], off offset:16 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[27:30], off offset:32 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[31:34], off offset:48 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <8 x double>, ptr addrspace(1) %out + %writelane = call <8 x double> @llvm.amdgcn.writelane.v8f64(<8 x double> %src, i32 %src1, <8 x double> %oldval) + store <8 x double> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent }