diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a0c703d2df8a2..f4caaf426de6a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23186,6 +23186,11 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { } if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) { + // TODO: Check if shuffle mask is legal? + if (LegalOperations && TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VecVT) && + !VecOp.hasOneUse()) + return SDValue(); + SDValue InOp = SVInVec.getOperand(OrigElt); if (InOp.getValueType() != ScalarVT) { assert(InOp.getValueType().isInteger() && ScalarVT.isInteger()); diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll index fafdac7ffac62..b242517b50841 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll @@ -176,8 +176,7 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -192,8 +191,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -566,16 +565,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -583,16 +581,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -675,31 +672,26 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -873,8 +865,7 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -887,8 +878,7 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -972,8 +962,7 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -989,8 +978,7 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1138,14 +1126,13 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1155,14 +1142,13 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1303,16 +1289,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1320,16 +1305,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1723,8 +1707,7 @@ define void @v_shuffle_v4f32_v2f32__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1740,8 +1723,7 @@ define void @v_shuffle_v4f32_v2f32__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1776,14 +1758,13 @@ define void @v_shuffle_v4f32_v2f32__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1793,14 +1774,13 @@ define void @v_shuffle_v4f32_v2f32__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2152,8 +2132,7 @@ define void @v_shuffle_v4f32_v2f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2166,8 +2145,7 @@ define void @v_shuffle_v4f32_v2f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2246,16 +2224,15 @@ define void @v_shuffle_v4f32_v2f32__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2263,16 +2240,15 @@ define void @v_shuffle_v4f32_v2f32__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2516,15 +2492,13 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2534,15 +2508,13 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2826,16 +2798,15 @@ define void @v_shuffle_v4f32_v2f32__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2843,16 +2814,15 @@ define void @v_shuffle_v4f32_v2f32__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2938,8 +2908,7 @@ define void @v_shuffle_v4f32_v2f32__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2952,8 +2921,7 @@ define void @v_shuffle_v4f32_v2f32__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -3639,9 +3607,9 @@ define void @s_shuffle_v4f32_v2f32__3_3_3_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: s_mov_b32 s10, s9 ; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3656,9 +3624,9 @@ define void @s_shuffle_v4f32_v2f32__3_3_3_0() { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def s[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: s_mov_b32 s10, s9 ; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use s[8:11] ; GFX940-NEXT: ;;#ASMEND @@ -3709,35 +3677,19 @@ define void @s_shuffle_v4f32_v2f32__3_3_3_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: s_mov_b32 s11, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_3_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:1] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s1 -; GFX940-NEXT: s_mov_b32 s9, s1 -; GFX940-NEXT: s_mov_b32 s10, s1 -; GFX940-NEXT: s_mov_b32 s11, s0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX90APLUS-LABEL: s_shuffle_v4f32_v2f32__3_3_3_2: +; GFX90APLUS: ; %bb.0: +; GFX90APLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90APLUS-NEXT: ;;#ASMSTART +; GFX90APLUS-NEXT: ; def s[8:9] +; GFX90APLUS-NEXT: ;;#ASMEND +; GFX90APLUS-NEXT: s_mov_b32 s10, s9 +; GFX90APLUS-NEXT: s_mov_b32 s11, s8 +; GFX90APLUS-NEXT: s_mov_b32 s8, s9 +; GFX90APLUS-NEXT: ;;#ASMSTART +; GFX90APLUS-NEXT: ; use s[8:11] +; GFX90APLUS-NEXT: ;;#ASMEND +; GFX90APLUS-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> @@ -4178,9 +4130,9 @@ define void @s_shuffle_v4f32_v2f32__3_3_1_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -4195,9 +4147,9 @@ define void @s_shuffle_v4f32_v2f32__3_3_1_0() { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def s[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: s_mov_b32 s10, s1 ; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use s[8:11] ; GFX940-NEXT: ;;#ASMEND @@ -5081,15 +5033,14 @@ define void @s_shuffle_v4f32_v2f32__3_3_1_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[8:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -5099,15 +5050,14 @@ define void @s_shuffle_v4f32_v2f32__3_3_1_2() { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ; def s[8:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ; def s[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s3 -; GFX940-NEXT: s_mov_b32 s9, s3 ; GFX940-NEXT: s_mov_b32 s10, s1 -; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: s_mov_b32 s11, s8 +; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use s[8:11] ; GFX940-NEXT: ;;#ASMEND @@ -5482,5 +5432,3 @@ define void @s_shuffle_v4f32_v2f32__3_3_2_3() { call void asm sideeffect "; use $0", "{s[8:11]}"(<4 x float> %shuf) ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll index c797ef338dee9..31c458f5338cb 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -1280,8 +1280,7 @@ define void @v_shuffle_v4f32_v3f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -1296,8 +1295,7 @@ define void @v_shuffle_v4f32_v3f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -1435,8 +1433,7 @@ define void @v_shuffle_v4f32_v3f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -1453,8 +1450,8 @@ define void @v_shuffle_v4f32_v3f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -1964,15 +1961,14 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1982,15 +1978,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2142,17 +2138,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2160,17 +2155,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v9, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v8 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3277,9 +3272,9 @@ define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3292,8 +3287,8 @@ define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -3421,12 +3416,12 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3439,12 +3434,12 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3991,17 +3986,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4009,18 +4004,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 -; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4112,17 +4106,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4130,17 +4124,18 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v8 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4317,8 +4312,7 @@ define void @v_shuffle_v4f32_v3f32__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4333,8 +4327,7 @@ define void @v_shuffle_v4f32_v3f32__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -4858,17 +4851,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4876,17 +4868,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4972,31 +4964,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_4_3: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6094,9 +6083,9 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6113,8 +6102,8 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6252,9 +6241,9 @@ define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6267,8 +6256,8 @@ define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6727,17 +6716,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6745,17 +6733,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v3, v6 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6893,11 +6881,10 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6910,9 +6897,8 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll index 40d6ad90ab34b..e3427cd35c683 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll @@ -343,13 +343,12 @@ define void @v_shuffle_v4f32_v4f32__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -364,9 +363,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -456,8 +454,7 @@ define void @v_shuffle_v4f32_v4f32__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -472,8 +469,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -560,9 +557,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -573,9 +569,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -649,8 +644,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -662,8 +656,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1169,11 +1162,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1188,11 +1180,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1287,17 +1278,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1305,17 +1295,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: v_mov_b32_e32 v6, v7 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1402,31 +1392,26 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1508,31 +1493,26 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1709,8 +1689,7 @@ define void @v_shuffle_v4f32_v4f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1724,8 +1703,7 @@ define void @v_shuffle_v4f32_v4f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -1809,8 +1787,7 @@ define void @v_shuffle_v4f32_v4f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1824,8 +1801,7 @@ define void @v_shuffle_v4f32_v4f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -1914,8 +1890,7 @@ define void @v_shuffle_v4f32_v4f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1933,8 +1908,7 @@ define void @v_shuffle_v4f32_v4f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2038,8 +2012,7 @@ define void @v_shuffle_v4f32_v4f32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -2057,8 +2030,7 @@ define void @v_shuffle_v4f32_v4f32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2214,17 +2186,16 @@ define void @v_shuffle_v4f32_v4f32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,17 +2203,16 @@ define void @v_shuffle_v4f32_v4f32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, v0 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2339,12 +2309,11 @@ define void @v_shuffle_v4f32_v4f32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2354,15 +2323,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, v0 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2460,8 +2429,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -2479,8 +2447,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2643,11 +2610,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2662,11 +2628,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2759,17 +2724,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2777,17 +2741,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2883,11 +2846,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2897,14 +2860,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3517,8 +3481,7 @@ define void @v_shuffle_v4f32_v4f32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -3536,8 +3499,7 @@ define void @v_shuffle_v4f32_v4f32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -3573,17 +3535,16 @@ define void @v_shuffle_v4f32_v4f32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3591,17 +3552,16 @@ define void @v_shuffle_v4f32_v4f32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v7, v1 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3697,12 +3657,11 @@ define void @v_shuffle_v4f32_v4f32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3712,15 +3671,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v1 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3818,8 +3777,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -3837,8 +3795,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -4444,8 +4401,7 @@ define void @v_shuffle_v4f32_v4f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4458,8 +4414,7 @@ define void @v_shuffle_v4f32_v4f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4540,8 +4495,7 @@ define void @v_shuffle_v4f32_v4f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4554,8 +4508,7 @@ define void @v_shuffle_v4f32_v4f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4639,8 +4592,7 @@ define void @v_shuffle_v4f32_v4f32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4656,8 +4608,8 @@ define void @v_shuffle_v4f32_v4f32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4755,8 +4707,7 @@ define void @v_shuffle_v4f32_v4f32__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4772,8 +4723,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4863,17 +4814,15 @@ define void @v_shuffle_v4f32_v4f32__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4881,17 +4830,15 @@ define void @v_shuffle_v4f32_v4f32__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5042,8 +4989,7 @@ define void @v_shuffle_v4f32_v4f32__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5060,8 +5006,7 @@ define void @v_shuffle_v4f32_v4f32__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5158,8 +5103,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5176,8 +5120,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5388,17 +5331,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5406,17 +5348,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5449,17 +5390,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5467,17 +5407,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5568,16 +5507,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5585,16 +5524,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, v7 -; GFX940-NEXT: v_mov_b32_e32 v8, v5 -; GFX940-NEXT: v_mov_b32_e32 v9, v2 -; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6193,17 +6133,15 @@ define void @v_shuffle_v4f32_v4f32__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6211,17 +6149,15 @@ define void @v_shuffle_v4f32_v4f32__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6314,8 +6250,7 @@ define void @v_shuffle_v4f32_v4f32__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6331,8 +6266,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6372,8 +6307,7 @@ define void @v_shuffle_v4f32_v4f32__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6390,8 +6324,7 @@ define void @v_shuffle_v4f32_v4f32__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6488,8 +6421,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6506,8 +6438,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7195,8 +7126,7 @@ define void @v_shuffle_v4f32_v4f32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -7210,8 +7140,7 @@ define void @v_shuffle_v4f32_v4f32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -7297,8 +7226,7 @@ define void @v_shuffle_v4f32_v4f32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -7312,8 +7240,7 @@ define void @v_shuffle_v4f32_v4f32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -7404,8 +7331,7 @@ define void @v_shuffle_v4f32_v4f32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] @@ -7423,8 +7349,7 @@ define void @v_shuffle_v4f32_v4f32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v6, v2 ; GFX940-NEXT: v_mov_b32_e32 v7, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 @@ -7523,8 +7448,7 @@ define void @v_shuffle_v4f32_v4f32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -7541,8 +7465,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 @@ -7686,31 +7610,28 @@ define void @v_shuffle_v4f32_v4f32__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_4_4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, v0 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7904,11 +7825,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7918,14 +7839,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8022,10 +7944,9 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8040,10 +7961,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8076,9 +7997,8 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8090,9 +8010,8 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8703,12 +8622,10 @@ define void @v_shuffle_v4f32_v4f32__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8718,16 +8635,14 @@ define void @v_shuffle_v4f32_v4f32__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8822,8 +8737,7 @@ define void @v_shuffle_v4f32_v4f32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -8840,8 +8754,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 @@ -8938,8 +8852,7 @@ define void @v_shuffle_v4f32_v4f32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -8953,8 +8866,7 @@ define void @v_shuffle_v4f32_v4f32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -8985,31 +8897,28 @@ define void @v_shuffle_v4f32_v4f32__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_5_5: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v7, v1 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9584,8 +9493,7 @@ define void @v_shuffle_v4f32_v4f32__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9602,8 +9510,7 @@ define void @v_shuffle_v4f32_v4f32__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9701,8 +9608,7 @@ define void @v_shuffle_v4f32_v4f32__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9718,8 +9624,8 @@ define void @v_shuffle_v4f32_v4f32__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, v6 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9799,8 +9705,7 @@ define void @v_shuffle_v4f32_v4f32__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9813,8 +9718,7 @@ define void @v_shuffle_v4f32_v4f32__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9897,8 +9801,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9911,8 +9814,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9998,8 +9900,7 @@ define void @v_shuffle_v4f32_v4f32__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10016,8 +9917,7 @@ define void @v_shuffle_v4f32_v4f32__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -10116,8 +10016,7 @@ define void @v_shuffle_v4f32_v4f32__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10133,8 +10032,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -10222,31 +10121,26 @@ define void @v_shuffle_v4f32_v4f32__7_4_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_6_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_6_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10486,15 +10380,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10504,15 +10397,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_mov_b32_e32 v9, v4 -; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10610,10 +10503,9 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10628,10 +10520,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10714,31 +10606,26 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_5_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11246,8 +11133,7 @@ define void @v_shuffle_v4f32_v4f32__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11264,8 +11150,7 @@ define void @v_shuffle_v4f32_v4f32__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v5 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11364,8 +11249,7 @@ define void @v_shuffle_v4f32_v4f32__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v6, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11381,8 +11265,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v6, v7 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11471,31 +11355,26 @@ define void @v_shuffle_v4f32_v4f32__7_4_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_7_7: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11573,8 +11452,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11587,8 +11465,7 @@ define void @v_shuffle_v4f32_v4f32__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll index 86e8e2ed267dc..e2cfbe0e9e172 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll @@ -176,8 +176,7 @@ define void @v_shuffle_v4i32_v2i32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -192,8 +191,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -566,16 +565,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -583,16 +581,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -675,31 +672,26 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -876,8 +868,7 @@ define void @v_shuffle_v4i32_v2i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -890,8 +881,7 @@ define void @v_shuffle_v4i32_v2i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -978,8 +968,7 @@ define void @v_shuffle_v4i32_v2i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -995,8 +984,7 @@ define void @v_shuffle_v4i32_v2i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1144,14 +1132,13 @@ define void @v_shuffle_v4i32_v2i32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1161,14 +1148,13 @@ define void @v_shuffle_v4i32_v2i32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1309,16 +1295,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1326,16 +1311,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1729,8 +1713,7 @@ define void @v_shuffle_v4i32_v2i32__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1746,8 +1729,7 @@ define void @v_shuffle_v4i32_v2i32__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1782,14 +1764,13 @@ define void @v_shuffle_v4i32_v2i32__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1799,14 +1780,13 @@ define void @v_shuffle_v4i32_v2i32__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2158,8 +2138,7 @@ define void @v_shuffle_v4i32_v2i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2172,8 +2151,7 @@ define void @v_shuffle_v4i32_v2i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2252,16 +2230,15 @@ define void @v_shuffle_v4i32_v2i32__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2269,16 +2246,15 @@ define void @v_shuffle_v4i32_v2i32__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2522,15 +2498,13 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2540,15 +2514,13 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2832,16 +2804,15 @@ define void @v_shuffle_v4i32_v2i32__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2849,16 +2820,15 @@ define void @v_shuffle_v4i32_v2i32__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2944,8 +2914,7 @@ define void @v_shuffle_v4i32_v2i32__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2958,8 +2927,7 @@ define void @v_shuffle_v4i32_v2i32__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -3645,9 +3613,9 @@ define void @s_shuffle_v4i32_v2i32__3_3_3_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: s_mov_b32 s10, s9 ; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3662,9 +3630,9 @@ define void @s_shuffle_v4i32_v2i32__3_3_3_0() { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def s[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: s_mov_b32 s10, s9 ; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use s[8:11] ; GFX940-NEXT: ;;#ASMEND @@ -3715,35 +3683,19 @@ define void @s_shuffle_v4i32_v2i32__3_3_3_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: s_mov_b32 s11, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_3_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:1] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s1 -; GFX940-NEXT: s_mov_b32 s9, s1 -; GFX940-NEXT: s_mov_b32 s10, s1 -; GFX940-NEXT: s_mov_b32 s11, s0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX90APLUS-LABEL: s_shuffle_v4i32_v2i32__3_3_3_2: +; GFX90APLUS: ; %bb.0: +; GFX90APLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90APLUS-NEXT: ;;#ASMSTART +; GFX90APLUS-NEXT: ; def s[8:9] +; GFX90APLUS-NEXT: ;;#ASMEND +; GFX90APLUS-NEXT: s_mov_b32 s10, s9 +; GFX90APLUS-NEXT: s_mov_b32 s11, s8 +; GFX90APLUS-NEXT: s_mov_b32 s8, s9 +; GFX90APLUS-NEXT: ;;#ASMSTART +; GFX90APLUS-NEXT: ; use s[8:11] +; GFX90APLUS-NEXT: ;;#ASMEND +; GFX90APLUS-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> @@ -4186,9 +4138,9 @@ define void @s_shuffle_v4i32_v2i32__3_3_1_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -4203,9 +4155,9 @@ define void @s_shuffle_v4i32_v2i32__3_3_1_0() { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def s[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: s_mov_b32 s10, s1 ; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use s[8:11] ; GFX940-NEXT: ;;#ASMEND @@ -5089,15 +5041,14 @@ define void @s_shuffle_v4i32_v2i32__3_3_1_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[8:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -5107,15 +5058,14 @@ define void @s_shuffle_v4i32_v2i32__3_3_1_2() { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ; def s[8:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ; def s[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s3 -; GFX940-NEXT: s_mov_b32 s9, s3 ; GFX940-NEXT: s_mov_b32 s10, s1 -; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: s_mov_b32 s11, s8 +; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use s[8:11] ; GFX940-NEXT: ;;#ASMEND @@ -5490,5 +5440,3 @@ define void @s_shuffle_v4i32_v2i32__3_3_2_3() { call void asm sideeffect "; use $0", "{s[8:11]}"(<4 x i32> %shuf) ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll index 770c1c19087b4..1b003a7c5d9bc 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -1280,8 +1280,7 @@ define void @v_shuffle_v4i32_v3i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -1296,8 +1295,7 @@ define void @v_shuffle_v4i32_v3i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -1435,8 +1433,7 @@ define void @v_shuffle_v4i32_v3i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -1453,8 +1450,8 @@ define void @v_shuffle_v4i32_v3i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -1964,15 +1961,14 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1982,15 +1978,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2142,17 +2138,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2160,17 +2155,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v9, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v8 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3277,9 +3272,9 @@ define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3292,8 +3287,8 @@ define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -3421,12 +3416,12 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3439,12 +3434,12 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3991,17 +3986,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4009,18 +4004,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 -; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4112,17 +4106,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4130,17 +4124,18 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v8 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4317,8 +4312,7 @@ define void @v_shuffle_v4i32_v3i32__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4333,8 +4327,7 @@ define void @v_shuffle_v4i32_v3i32__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -4858,17 +4851,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4876,17 +4868,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4972,31 +4964,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_4_3: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6094,9 +6083,9 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6113,8 +6102,8 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6252,9 +6241,9 @@ define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6267,8 +6256,8 @@ define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6727,17 +6716,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6745,17 +6733,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v3, v6 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6893,11 +6881,10 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6910,9 +6897,8 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll index 3d1bb1da92e09..47ad1c4bedb8b 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll @@ -343,13 +343,12 @@ define void @v_shuffle_v4i32_v4i32__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -364,9 +363,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -456,8 +454,7 @@ define void @v_shuffle_v4i32_v4i32__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -472,8 +469,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -560,9 +557,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -573,9 +569,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -649,8 +644,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -662,8 +656,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1169,11 +1162,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1188,11 +1180,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1287,17 +1278,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1305,17 +1295,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: v_mov_b32_e32 v6, v7 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1402,31 +1392,26 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1508,31 +1493,26 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1709,8 +1689,7 @@ define void @v_shuffle_v4i32_v4i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1724,8 +1703,7 @@ define void @v_shuffle_v4i32_v4i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -1809,8 +1787,7 @@ define void @v_shuffle_v4i32_v4i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1824,8 +1801,7 @@ define void @v_shuffle_v4i32_v4i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -1914,8 +1890,7 @@ define void @v_shuffle_v4i32_v4i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1933,8 +1908,7 @@ define void @v_shuffle_v4i32_v4i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2038,8 +2012,7 @@ define void @v_shuffle_v4i32_v4i32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -2057,8 +2030,7 @@ define void @v_shuffle_v4i32_v4i32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2214,17 +2186,16 @@ define void @v_shuffle_v4i32_v4i32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,17 +2203,16 @@ define void @v_shuffle_v4i32_v4i32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, v0 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2339,12 +2309,11 @@ define void @v_shuffle_v4i32_v4i32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2354,15 +2323,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, v0 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2460,8 +2429,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -2479,8 +2447,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2643,11 +2610,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2662,11 +2628,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2759,17 +2724,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2777,17 +2741,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2883,11 +2846,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2897,14 +2860,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3517,8 +3481,7 @@ define void @v_shuffle_v4i32_v4i32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -3536,8 +3499,7 @@ define void @v_shuffle_v4i32_v4i32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -3573,17 +3535,16 @@ define void @v_shuffle_v4i32_v4i32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3591,17 +3552,16 @@ define void @v_shuffle_v4i32_v4i32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v7, v1 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3697,12 +3657,11 @@ define void @v_shuffle_v4i32_v4i32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3712,15 +3671,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v1 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3818,8 +3777,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -3837,8 +3795,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -4444,8 +4401,7 @@ define void @v_shuffle_v4i32_v4i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4458,8 +4414,7 @@ define void @v_shuffle_v4i32_v4i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4540,8 +4495,7 @@ define void @v_shuffle_v4i32_v4i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4554,8 +4508,7 @@ define void @v_shuffle_v4i32_v4i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4639,8 +4592,7 @@ define void @v_shuffle_v4i32_v4i32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4656,8 +4608,8 @@ define void @v_shuffle_v4i32_v4i32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4755,8 +4707,7 @@ define void @v_shuffle_v4i32_v4i32__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4772,8 +4723,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4863,17 +4814,15 @@ define void @v_shuffle_v4i32_v4i32__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4881,17 +4830,15 @@ define void @v_shuffle_v4i32_v4i32__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5042,8 +4989,7 @@ define void @v_shuffle_v4i32_v4i32__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5060,8 +5006,7 @@ define void @v_shuffle_v4i32_v4i32__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5158,8 +5103,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5176,8 +5120,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5388,17 +5331,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5406,17 +5348,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5449,17 +5390,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5467,17 +5407,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5568,16 +5507,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5585,16 +5524,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, v7 -; GFX940-NEXT: v_mov_b32_e32 v8, v5 -; GFX940-NEXT: v_mov_b32_e32 v9, v2 -; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6193,17 +6133,15 @@ define void @v_shuffle_v4i32_v4i32__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6211,17 +6149,15 @@ define void @v_shuffle_v4i32_v4i32__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6314,8 +6250,7 @@ define void @v_shuffle_v4i32_v4i32__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6331,8 +6266,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6372,8 +6307,7 @@ define void @v_shuffle_v4i32_v4i32__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6390,8 +6324,7 @@ define void @v_shuffle_v4i32_v4i32__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6488,8 +6421,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6506,8 +6438,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7195,8 +7126,7 @@ define void @v_shuffle_v4i32_v4i32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -7210,8 +7140,7 @@ define void @v_shuffle_v4i32_v4i32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -7297,8 +7226,7 @@ define void @v_shuffle_v4i32_v4i32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -7312,8 +7240,7 @@ define void @v_shuffle_v4i32_v4i32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -7404,8 +7331,7 @@ define void @v_shuffle_v4i32_v4i32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] @@ -7423,8 +7349,7 @@ define void @v_shuffle_v4i32_v4i32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v6, v2 ; GFX940-NEXT: v_mov_b32_e32 v7, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 @@ -7523,8 +7448,7 @@ define void @v_shuffle_v4i32_v4i32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -7541,8 +7465,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 @@ -7686,31 +7610,28 @@ define void @v_shuffle_v4i32_v4i32__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_4_4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, v0 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7904,11 +7825,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7918,14 +7839,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8022,10 +7944,9 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8040,10 +7961,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8076,9 +7997,8 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8090,9 +8010,8 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8703,12 +8622,10 @@ define void @v_shuffle_v4i32_v4i32__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8718,16 +8635,14 @@ define void @v_shuffle_v4i32_v4i32__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8822,8 +8737,7 @@ define void @v_shuffle_v4i32_v4i32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -8840,8 +8754,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 @@ -8938,8 +8852,7 @@ define void @v_shuffle_v4i32_v4i32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -8953,8 +8866,7 @@ define void @v_shuffle_v4i32_v4i32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -8985,31 +8897,28 @@ define void @v_shuffle_v4i32_v4i32__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_5_5: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v7, v1 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9584,8 +9493,7 @@ define void @v_shuffle_v4i32_v4i32__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9602,8 +9510,7 @@ define void @v_shuffle_v4i32_v4i32__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9701,8 +9608,7 @@ define void @v_shuffle_v4i32_v4i32__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9718,8 +9624,8 @@ define void @v_shuffle_v4i32_v4i32__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, v6 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9799,8 +9705,7 @@ define void @v_shuffle_v4i32_v4i32__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9813,8 +9718,7 @@ define void @v_shuffle_v4i32_v4i32__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9897,8 +9801,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9911,8 +9814,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9998,8 +9900,7 @@ define void @v_shuffle_v4i32_v4i32__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10016,8 +9917,7 @@ define void @v_shuffle_v4i32_v4i32__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -10116,8 +10016,7 @@ define void @v_shuffle_v4i32_v4i32__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10133,8 +10032,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -10222,31 +10121,26 @@ define void @v_shuffle_v4i32_v4i32__7_4_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_6_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_6_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10486,15 +10380,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10504,15 +10397,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_mov_b32_e32 v9, v4 -; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10610,10 +10503,9 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10628,10 +10520,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10714,31 +10606,26 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_5_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11246,8 +11133,7 @@ define void @v_shuffle_v4i32_v4i32__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11264,8 +11150,7 @@ define void @v_shuffle_v4i32_v4i32__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v5 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11364,8 +11249,7 @@ define void @v_shuffle_v4i32_v4i32__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v6, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11381,8 +11265,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v6, v7 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11471,31 +11355,26 @@ define void @v_shuffle_v4i32_v4i32__7_4_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_7_7: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11573,8 +11452,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11587,8 +11465,7 @@ define void @v_shuffle_v4i32_v4i32__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll index 2dd794404158c..7a760e2da7dfc 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll @@ -176,8 +176,7 @@ define void @v_shuffle_v4p3_v2p3__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -192,8 +191,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -566,16 +565,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -583,16 +581,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -675,31 +672,26 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -876,8 +868,7 @@ define void @v_shuffle_v4p3_v2p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -890,8 +881,7 @@ define void @v_shuffle_v4p3_v2p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -978,8 +968,7 @@ define void @v_shuffle_v4p3_v2p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -995,8 +984,7 @@ define void @v_shuffle_v4p3_v2p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1144,14 +1132,13 @@ define void @v_shuffle_v4p3_v2p3__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1161,14 +1148,13 @@ define void @v_shuffle_v4p3_v2p3__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1309,16 +1295,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1326,16 +1311,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1729,8 +1713,7 @@ define void @v_shuffle_v4p3_v2p3__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1746,8 +1729,7 @@ define void @v_shuffle_v4p3_v2p3__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1782,14 +1764,13 @@ define void @v_shuffle_v4p3_v2p3__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1799,14 +1780,13 @@ define void @v_shuffle_v4p3_v2p3__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2158,8 +2138,7 @@ define void @v_shuffle_v4p3_v2p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2172,8 +2151,7 @@ define void @v_shuffle_v4p3_v2p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2252,16 +2230,15 @@ define void @v_shuffle_v4p3_v2p3__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2269,16 +2246,15 @@ define void @v_shuffle_v4p3_v2p3__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2522,15 +2498,13 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2540,15 +2514,13 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2832,16 +2804,15 @@ define void @v_shuffle_v4p3_v2p3__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2849,16 +2820,15 @@ define void @v_shuffle_v4p3_v2p3__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2944,8 +2914,7 @@ define void @v_shuffle_v4p3_v2p3__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2958,8 +2927,7 @@ define void @v_shuffle_v4p3_v2p3__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -3645,9 +3613,9 @@ define void @s_shuffle_v4p3_v2p3__3_3_3_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: s_mov_b32 s10, s9 ; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3662,9 +3630,9 @@ define void @s_shuffle_v4p3_v2p3__3_3_3_0() { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def s[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: s_mov_b32 s10, s9 ; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use s[8:11] ; GFX940-NEXT: ;;#ASMEND @@ -3715,35 +3683,19 @@ define void @s_shuffle_v4p3_v2p3__3_3_3_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: s_mov_b32 s11, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_3_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:1] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s1 -; GFX940-NEXT: s_mov_b32 s9, s1 -; GFX940-NEXT: s_mov_b32 s10, s1 -; GFX940-NEXT: s_mov_b32 s11, s0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX90APLUS-LABEL: s_shuffle_v4p3_v2p3__3_3_3_2: +; GFX90APLUS: ; %bb.0: +; GFX90APLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90APLUS-NEXT: ;;#ASMSTART +; GFX90APLUS-NEXT: ; def s[8:9] +; GFX90APLUS-NEXT: ;;#ASMEND +; GFX90APLUS-NEXT: s_mov_b32 s10, s9 +; GFX90APLUS-NEXT: s_mov_b32 s11, s8 +; GFX90APLUS-NEXT: s_mov_b32 s8, s9 +; GFX90APLUS-NEXT: ;;#ASMSTART +; GFX90APLUS-NEXT: ; use s[8:11] +; GFX90APLUS-NEXT: ;;#ASMEND +; GFX90APLUS-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> @@ -4186,9 +4138,9 @@ define void @s_shuffle_v4p3_v2p3__3_3_1_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -4203,9 +4155,9 @@ define void @s_shuffle_v4p3_v2p3__3_3_1_0() { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def s[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: s_mov_b32 s10, s1 ; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use s[8:11] ; GFX940-NEXT: ;;#ASMEND @@ -5089,15 +5041,14 @@ define void @s_shuffle_v4p3_v2p3__3_3_1_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[8:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: s_mov_b32 s8, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -5107,15 +5058,14 @@ define void @s_shuffle_v4p3_v2p3__3_3_1_2() { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ; def s[8:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ; def s[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s3 -; GFX940-NEXT: s_mov_b32 s9, s3 ; GFX940-NEXT: s_mov_b32 s10, s1 -; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: s_mov_b32 s11, s8 +; GFX940-NEXT: s_mov_b32 s8, s9 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use s[8:11] ; GFX940-NEXT: ;;#ASMEND @@ -5490,5 +5440,3 @@ define void @s_shuffle_v4p3_v2p3__3_3_2_3() { call void asm sideeffect "; use $0", "{s[8:11]}"(<4 x ptr addrspace(3)> %shuf) ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll index d6bd854283180..28bc61ce57815 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -1280,8 +1280,7 @@ define void @v_shuffle_v4p3_v3p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -1296,8 +1295,7 @@ define void @v_shuffle_v4p3_v3p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -1435,8 +1433,7 @@ define void @v_shuffle_v4p3_v3p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -1453,8 +1450,8 @@ define void @v_shuffle_v4p3_v3p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -1964,15 +1961,14 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1982,15 +1978,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2142,17 +2138,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2160,17 +2155,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v9, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v8 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3277,9 +3272,9 @@ define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3292,8 +3287,8 @@ define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -3421,12 +3416,12 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3439,12 +3434,12 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3991,17 +3986,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4009,18 +4004,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 -; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4112,17 +4106,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4130,17 +4124,18 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v8 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4317,8 +4312,7 @@ define void @v_shuffle_v4p3_v3p3__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4333,8 +4327,7 @@ define void @v_shuffle_v4p3_v3p3__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -4858,17 +4851,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4876,17 +4868,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4972,31 +4964,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_4_3: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6094,9 +6083,9 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6113,8 +6102,8 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6252,9 +6241,9 @@ define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6267,8 +6256,8 @@ define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6727,17 +6716,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6745,17 +6733,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v3, v6 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6893,11 +6881,10 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6910,9 +6897,8 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll index e4c7cebb231bd..9cc1b9fe6cf0e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll @@ -343,13 +343,12 @@ define void @v_shuffle_v4p3_v4p3__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -364,9 +363,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -456,8 +454,7 @@ define void @v_shuffle_v4p3_v4p3__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -472,8 +469,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -560,9 +557,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -573,9 +569,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -649,8 +644,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -662,8 +656,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1169,11 +1162,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1188,11 +1180,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1287,17 +1278,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1305,17 +1295,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: v_mov_b32_e32 v6, v7 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1402,31 +1392,26 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1508,31 +1493,26 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1709,8 +1689,7 @@ define void @v_shuffle_v4p3_v4p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1724,8 +1703,7 @@ define void @v_shuffle_v4p3_v4p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -1809,8 +1787,7 @@ define void @v_shuffle_v4p3_v4p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1824,8 +1801,7 @@ define void @v_shuffle_v4p3_v4p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -1914,8 +1890,7 @@ define void @v_shuffle_v4p3_v4p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1933,8 +1908,7 @@ define void @v_shuffle_v4p3_v4p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2038,8 +2012,7 @@ define void @v_shuffle_v4p3_v4p3__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -2057,8 +2030,7 @@ define void @v_shuffle_v4p3_v4p3__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2214,17 +2186,16 @@ define void @v_shuffle_v4p3_v4p3__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,17 +2203,16 @@ define void @v_shuffle_v4p3_v4p3__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, v0 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2339,12 +2309,11 @@ define void @v_shuffle_v4p3_v4p3__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2354,15 +2323,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, v0 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2460,8 +2429,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -2479,8 +2447,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2643,11 +2610,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2662,11 +2628,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2759,17 +2724,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2777,17 +2741,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2883,11 +2846,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2897,14 +2860,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3517,8 +3481,7 @@ define void @v_shuffle_v4p3_v4p3__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -3536,8 +3499,7 @@ define void @v_shuffle_v4p3_v4p3__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -3573,17 +3535,16 @@ define void @v_shuffle_v4p3_v4p3__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3591,17 +3552,16 @@ define void @v_shuffle_v4p3_v4p3__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v7, v1 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3697,12 +3657,11 @@ define void @v_shuffle_v4p3_v4p3__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3712,15 +3671,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v1 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3818,8 +3777,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -3837,8 +3795,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -4444,8 +4401,7 @@ define void @v_shuffle_v4p3_v4p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4458,8 +4414,7 @@ define void @v_shuffle_v4p3_v4p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4540,8 +4495,7 @@ define void @v_shuffle_v4p3_v4p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4554,8 +4508,7 @@ define void @v_shuffle_v4p3_v4p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4639,8 +4592,7 @@ define void @v_shuffle_v4p3_v4p3__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4656,8 +4608,8 @@ define void @v_shuffle_v4p3_v4p3__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4755,8 +4707,7 @@ define void @v_shuffle_v4p3_v4p3__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4772,8 +4723,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4863,17 +4814,15 @@ define void @v_shuffle_v4p3_v4p3__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4881,17 +4830,15 @@ define void @v_shuffle_v4p3_v4p3__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5042,8 +4989,7 @@ define void @v_shuffle_v4p3_v4p3__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5060,8 +5006,7 @@ define void @v_shuffle_v4p3_v4p3__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5158,8 +5103,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5176,8 +5120,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5388,17 +5331,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5406,17 +5348,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5449,17 +5390,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5467,17 +5407,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5568,16 +5507,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5585,16 +5524,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, v7 -; GFX940-NEXT: v_mov_b32_e32 v8, v5 -; GFX940-NEXT: v_mov_b32_e32 v9, v2 -; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6193,17 +6133,15 @@ define void @v_shuffle_v4p3_v4p3__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6211,17 +6149,15 @@ define void @v_shuffle_v4p3_v4p3__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6314,8 +6250,7 @@ define void @v_shuffle_v4p3_v4p3__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6331,8 +6266,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6372,8 +6307,7 @@ define void @v_shuffle_v4p3_v4p3__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6390,8 +6324,7 @@ define void @v_shuffle_v4p3_v4p3__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6488,8 +6421,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6506,8 +6438,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7195,8 +7126,7 @@ define void @v_shuffle_v4p3_v4p3__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -7210,8 +7140,7 @@ define void @v_shuffle_v4p3_v4p3__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -7297,8 +7226,7 @@ define void @v_shuffle_v4p3_v4p3__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -7312,8 +7240,7 @@ define void @v_shuffle_v4p3_v4p3__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -7404,8 +7331,7 @@ define void @v_shuffle_v4p3_v4p3__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] @@ -7423,8 +7349,7 @@ define void @v_shuffle_v4p3_v4p3__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v6, v2 ; GFX940-NEXT: v_mov_b32_e32 v7, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 @@ -7523,8 +7448,7 @@ define void @v_shuffle_v4p3_v4p3__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -7541,8 +7465,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 @@ -7686,31 +7610,28 @@ define void @v_shuffle_v4p3_v4p3__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_4_4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, v0 -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7904,11 +7825,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7918,14 +7839,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8022,10 +7944,9 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8040,10 +7961,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8076,9 +7997,8 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8090,9 +8010,8 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8703,12 +8622,10 @@ define void @v_shuffle_v4p3_v4p3__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8718,16 +8635,14 @@ define void @v_shuffle_v4p3_v4p3__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v5 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8822,8 +8737,7 @@ define void @v_shuffle_v4p3_v4p3__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -8840,8 +8754,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 @@ -8938,8 +8852,7 @@ define void @v_shuffle_v4p3_v4p3__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -8953,8 +8866,7 @@ define void @v_shuffle_v4p3_v4p3__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -8985,31 +8897,28 @@ define void @v_shuffle_v4p3_v4p3__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_5_5: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v7, v1 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9584,8 +9493,7 @@ define void @v_shuffle_v4p3_v4p3__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9602,8 +9510,7 @@ define void @v_shuffle_v4p3_v4p3__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9701,8 +9608,7 @@ define void @v_shuffle_v4p3_v4p3__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9718,8 +9624,8 @@ define void @v_shuffle_v4p3_v4p3__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, v6 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9799,8 +9705,7 @@ define void @v_shuffle_v4p3_v4p3__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9813,8 +9718,7 @@ define void @v_shuffle_v4p3_v4p3__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9897,8 +9801,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9911,8 +9814,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9998,8 +9900,7 @@ define void @v_shuffle_v4p3_v4p3__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10016,8 +9917,7 @@ define void @v_shuffle_v4p3_v4p3__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -10116,8 +10016,7 @@ define void @v_shuffle_v4p3_v4p3__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10133,8 +10032,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -10222,31 +10121,26 @@ define void @v_shuffle_v4p3_v4p3__7_4_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_6_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_6_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10486,15 +10380,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10504,15 +10397,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_mov_b32_e32 v9, v4 -; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10610,10 +10503,9 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10628,10 +10520,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10714,31 +10606,26 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_5_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, v1 -; GFX940-NEXT: v_mov_b32_e32 v7, v2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11246,8 +11133,7 @@ define void @v_shuffle_v4p3_v4p3__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11264,8 +11150,7 @@ define void @v_shuffle_v4p3_v4p3__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v4, v5 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11364,8 +11249,7 @@ define void @v_shuffle_v4p3_v4p3__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v6, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11381,8 +11265,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, v7 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v6, v7 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11471,31 +11355,26 @@ define void @v_shuffle_v4p3_v4p3__7_4_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_7_7: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_mov_b32_e32 v6, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11573,8 +11452,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11587,8 +11465,7 @@ define void @v_shuffle_v4p3_v4p3__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0)